/******************************************************************************
 *
 * Copyright (C) 2022 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 */
/**
 *******************************************************************************
 * @file
 *  isvcd_intra_resamp_sse42.c
 *
 * @brief
 *  Contains function definitions for intra resampling functions
 *
 * @author
 * Kishore
 *
 * @par List of Functions:
 *  - isvcd_interpolate_base_luma_dyadic_sse42
 *  - isvcd_vert_interpol_chroma_dyadic_1_sse42
 *  - isvcd_vert_interpol_chroma_dyadic_2_sse42
 *  - isvcd_vert_interpol_chroma_dyadic_3_sse42
 *  - isvcd_horz_interpol_chroma_dyadic_1_sse42
 *  - isvcd_horz_interpol_chroma_dyadic_2_sse42
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */
#include <immintrin.h>
#include <smmintrin.h>
#include <emmintrin.h>
/* User include files */
#include "ih264_typedefs.h"
#include "isvcd_structs.h"

/*****************************************************************************/
/*                                                                           */
/*  Function Name : isvcd_interpolate_base_luma_dyadic_sse42                 */
/*                                                                           */
/*  Description   : This function takes the reference array buffer & performs*/
/*                  intra resampling for dyadic scaling ratios               */
/*  Inputs        : pu1_inp_buf : ptr to the 12x12 reference sample buffer   */
/*                  pi2_tmp_filt_buf : ptr to the 12x16 buffer to hold the   */
/*                      vertically interpolated data                         */
/*                  pu1_out_buf : output buffer pointer                      */
/*                  i4_out_stride : output buffer stride                     */
/*  Globals       : none                                                     */
/*  Processing    : it does the interpolation in vertical direction followed */
/*                  by horizontal direction                                  */
/*  Outputs       : resampled pixels                                         */
/*  Returns       : none                                                     */
/*                                                                           */
/*  Issues        : none                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         05 21 2021   Dolan           creation                             */
/*                                                                           */
/*****************************************************************************/
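/* For reference, a minimal scalar sketch of the dyadic vertical filter this
 * function vectorizes (illustrative only, not part of the build; the helper
 * name is hypothetical). Output row 0 uses phase 12, after which rows
 * alternate phase 4 / phase 12 over the same sliding four-row window:
 *
 *     // phase  4 taps: {-3, 28,  8, -1}
 *     // phase 12 taps: {-1,  8, 28, -3}
 *     WORD16 filt_phase4(UWORD8 s0, UWORD8 s1, UWORD8 s2, UWORD8 s3)
 *     {
 *         return (WORD16) (-3 * s0 + 28 * s1 + 8 * s2 - s3);
 *     }
 *
 * The SIMD code below computes eight such columns per __m128i of 16-bit
 * intermediates.
 */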

void isvcd_interpolate_base_luma_dyadic_sse42(UWORD8 *pu1_inp_buf, WORD16 *pi2_tmp_filt_buf,
                                              UWORD8 *pu1_out_buf, WORD32 i4_out_stride)
{
    WORD32 i4_x, i4_y;
    WORD32 i4_filt_stride, i4_src_stride;
    UWORD8 *pu1_inp, *pu1_out;
    WORD16 *pi2_tmp;

    __m128i i4_samp_16x8b_0, i4_samp_16x8b_1, i4_samp_16x8b_2, i4_samp_16x8b_3;
    __m128i i4_samp_8x16b_0, i4_samp_8x16b_1, i4_samp_8x16b_2, i4_samp_8x16b_3;
    __m128i i4_res_8x16b_r1_1, i4_res_8x16b_r1_2, i4_res_8x16b_r1_3;
    __m128i i4_res_8x16b_r2_1, i4_res_8x16b_r2_2, i4_res_8x16b_r2_3;

    /* Filter coefficients -3 and 28, shared by phases 4 and 12 */
    __m128i i4_coeff_8x16b_0 = _mm_set1_epi16(-3);
    __m128i i4_coeff_8x16b_1 = _mm_set1_epi16(28);
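    /* The remaining taps, 8 and -1, need no multiplier: tap 8 is applied as a
     * left shift by 3 (_mm_slli_epi16) and tap -1 as a saturating subtract of
     * the raw sample (_mm_subs_epi16) in the stanzas below. */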
    i4_filt_stride = 12;
    i4_src_stride = DYADIC_REF_W_Y;

    pu1_inp = pu1_inp_buf;
    pi2_tmp = pi2_tmp_filt_buf;
    pu1_out = pu1_out_buf;

    /* Vertical interpolation */
    /* First 8 columns (64-bit loads) */
    for(i4_x = 0; i4_x < 1; i4_x++)
    {
        /* y = 0, y_phase = 12 */
        i4_samp_16x8b_0 = _mm_loadl_epi64((__m128i *) (pu1_inp));
        i4_samp_16x8b_1 = _mm_loadl_epi64((__m128i *) (pu1_inp + i4_src_stride));
        i4_samp_16x8b_2 = _mm_loadl_epi64((__m128i *) (pu1_inp + (i4_src_stride << 1)));
        i4_samp_16x8b_3 =
            _mm_loadl_epi64((__m128i *) (pu1_inp + (i4_src_stride << 1) + i4_src_stride));
        pu1_inp += (i4_src_stride << 2);
        i4_samp_8x16b_0 = _mm_cvtepu8_epi16(i4_samp_16x8b_0);
        i4_samp_8x16b_1 = _mm_cvtepu8_epi16(i4_samp_16x8b_1);
        i4_samp_8x16b_2 = _mm_cvtepu8_epi16(i4_samp_16x8b_2);
        i4_samp_8x16b_3 = _mm_cvtepu8_epi16(i4_samp_16x8b_3);

        /* since y_phase is 12 for y = 0 */
        /* Multiply by 8 => left shift by 3 */
        i4_res_8x16b_r1_1 = _mm_slli_epi16(i4_samp_8x16b_1, 3);
        i4_res_8x16b_r1_2 = _mm_mullo_epi16(i4_samp_8x16b_2, i4_coeff_8x16b_1);
        i4_res_8x16b_r1_3 = _mm_mullo_epi16(i4_samp_8x16b_3, i4_coeff_8x16b_0);

        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_2);
        i4_res_8x16b_r1_3 = _mm_subs_epi16(i4_res_8x16b_r1_3, i4_samp_8x16b_0);
        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_3);

        _mm_storeu_si128((__m128i *) pi2_tmp, i4_res_8x16b_r1_1);
        pi2_tmp += i4_filt_stride;

        i4_samp_8x16b_0 = i4_samp_8x16b_1;
        i4_samp_8x16b_1 = i4_samp_8x16b_2;
        i4_samp_8x16b_2 = i4_samp_8x16b_3;
        i4_samp_8x16b_3 = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *) (pu1_inp)));

        /* y_phase is 4 for odd values of y */
        /* and 12 for even values of y */
        /* Multiply by 8 => left shift by 3 */
        i4_res_8x16b_r1_1 = _mm_mullo_epi16(i4_samp_8x16b_0, i4_coeff_8x16b_0);
        i4_res_8x16b_r1_2 = _mm_mullo_epi16(i4_samp_8x16b_1, i4_coeff_8x16b_1);
        i4_res_8x16b_r1_3 = _mm_slli_epi16(i4_samp_8x16b_2, 3);

        i4_res_8x16b_r2_1 = _mm_slli_epi16(i4_samp_8x16b_1, 3);
        i4_res_8x16b_r2_2 = _mm_mullo_epi16(i4_samp_8x16b_2, i4_coeff_8x16b_1);
        i4_res_8x16b_r2_3 = _mm_mullo_epi16(i4_samp_8x16b_3, i4_coeff_8x16b_0);

        i4_res_8x16b_r1_3 = _mm_subs_epi16(i4_res_8x16b_r1_3, i4_samp_8x16b_3);
        i4_res_8x16b_r2_3 = _mm_subs_epi16(i4_res_8x16b_r2_3, i4_samp_8x16b_0);

        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_2);
        i4_res_8x16b_r2_1 = _mm_adds_epi16(i4_res_8x16b_r2_1, i4_res_8x16b_r2_2);

        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_3);
        i4_res_8x16b_r2_1 = _mm_adds_epi16(i4_res_8x16b_r2_1, i4_res_8x16b_r2_3);

        /* Storing the results */
        _mm_storeu_si128((__m128i *) pi2_tmp, i4_res_8x16b_r1_1);
        _mm_storeu_si128((__m128i *) (pi2_tmp + i4_filt_stride), i4_res_8x16b_r2_1);
        pi2_tmp += (i4_filt_stride << 1);
        pu1_inp += i4_src_stride;

        i4_samp_8x16b_0 = i4_samp_8x16b_1;
        i4_samp_8x16b_1 = i4_samp_8x16b_2;
        i4_samp_8x16b_2 = i4_samp_8x16b_3;
        i4_samp_8x16b_3 = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *) (pu1_inp)));

        /* y_phase is 4 for odd values of y */
        /* and 12 for even values of y */
        /* Multiply by 8 => left shift by 3 */
        i4_res_8x16b_r1_1 = _mm_mullo_epi16(i4_samp_8x16b_0, i4_coeff_8x16b_0);
        i4_res_8x16b_r1_2 = _mm_mullo_epi16(i4_samp_8x16b_1, i4_coeff_8x16b_1);
        i4_res_8x16b_r1_3 = _mm_slli_epi16(i4_samp_8x16b_2, 3);

        i4_res_8x16b_r2_1 = _mm_slli_epi16(i4_samp_8x16b_1, 3);
        i4_res_8x16b_r2_2 = _mm_mullo_epi16(i4_samp_8x16b_2, i4_coeff_8x16b_1);
        i4_res_8x16b_r2_3 = _mm_mullo_epi16(i4_samp_8x16b_3, i4_coeff_8x16b_0);

        i4_res_8x16b_r1_3 = _mm_subs_epi16(i4_res_8x16b_r1_3, i4_samp_8x16b_3);
        i4_res_8x16b_r2_3 = _mm_subs_epi16(i4_res_8x16b_r2_3, i4_samp_8x16b_0);

        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_2);
        i4_res_8x16b_r2_1 = _mm_adds_epi16(i4_res_8x16b_r2_1, i4_res_8x16b_r2_2);

        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_3);
        i4_res_8x16b_r2_1 = _mm_adds_epi16(i4_res_8x16b_r2_1, i4_res_8x16b_r2_3);

        /* Storing the results */
        _mm_storeu_si128((__m128i *) pi2_tmp, i4_res_8x16b_r1_1);
        _mm_storeu_si128((__m128i *) (pi2_tmp + i4_filt_stride), i4_res_8x16b_r2_1);
        pi2_tmp += (i4_filt_stride << 1);
        pu1_inp += i4_src_stride;

        i4_samp_8x16b_0 = i4_samp_8x16b_1;
        i4_samp_8x16b_1 = i4_samp_8x16b_2;
        i4_samp_8x16b_2 = i4_samp_8x16b_3;
        i4_samp_8x16b_3 = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *) (pu1_inp)));

        /* y_phase is 4 for odd values of y */
        /* and 12 for even values of y */
        /* Multiply by 8 => left shift by 3 */
        i4_res_8x16b_r1_1 = _mm_mullo_epi16(i4_samp_8x16b_0, i4_coeff_8x16b_0);
        i4_res_8x16b_r1_2 = _mm_mullo_epi16(i4_samp_8x16b_1, i4_coeff_8x16b_1);
        i4_res_8x16b_r1_3 = _mm_slli_epi16(i4_samp_8x16b_2, 3);

        i4_res_8x16b_r2_1 = _mm_slli_epi16(i4_samp_8x16b_1, 3);
        i4_res_8x16b_r2_2 = _mm_mullo_epi16(i4_samp_8x16b_2, i4_coeff_8x16b_1);
        i4_res_8x16b_r2_3 = _mm_mullo_epi16(i4_samp_8x16b_3, i4_coeff_8x16b_0);

        i4_res_8x16b_r1_3 = _mm_subs_epi16(i4_res_8x16b_r1_3, i4_samp_8x16b_3);
        i4_res_8x16b_r2_3 = _mm_subs_epi16(i4_res_8x16b_r2_3, i4_samp_8x16b_0);

        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_2);
        i4_res_8x16b_r2_1 = _mm_adds_epi16(i4_res_8x16b_r2_1, i4_res_8x16b_r2_2);

        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_3);
        i4_res_8x16b_r2_1 = _mm_adds_epi16(i4_res_8x16b_r2_1, i4_res_8x16b_r2_3);

        /* Storing the results */
        _mm_storeu_si128((__m128i *) pi2_tmp, i4_res_8x16b_r1_1);
        _mm_storeu_si128((__m128i *) (pi2_tmp + i4_filt_stride), i4_res_8x16b_r2_1);
        pi2_tmp += (i4_filt_stride << 1);
        pu1_inp += i4_src_stride;

        i4_samp_8x16b_0 = i4_samp_8x16b_1;
        i4_samp_8x16b_1 = i4_samp_8x16b_2;
        i4_samp_8x16b_2 = i4_samp_8x16b_3;
        i4_samp_8x16b_3 = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *) (pu1_inp)));

        /* y_phase is 4 for odd values of y */
        /* and 12 for even values of y */
        /* Multiply by 8 => left shift by 3 */
        i4_res_8x16b_r1_1 = _mm_mullo_epi16(i4_samp_8x16b_0, i4_coeff_8x16b_0);
        i4_res_8x16b_r1_2 = _mm_mullo_epi16(i4_samp_8x16b_1, i4_coeff_8x16b_1);
        i4_res_8x16b_r1_3 = _mm_slli_epi16(i4_samp_8x16b_2, 3);

        i4_res_8x16b_r2_1 = _mm_slli_epi16(i4_samp_8x16b_1, 3);
        i4_res_8x16b_r2_2 = _mm_mullo_epi16(i4_samp_8x16b_2, i4_coeff_8x16b_1);
        i4_res_8x16b_r2_3 = _mm_mullo_epi16(i4_samp_8x16b_3, i4_coeff_8x16b_0);

        i4_res_8x16b_r1_3 = _mm_subs_epi16(i4_res_8x16b_r1_3, i4_samp_8x16b_3);
        i4_res_8x16b_r2_3 = _mm_subs_epi16(i4_res_8x16b_r2_3, i4_samp_8x16b_0);

        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_2);
        i4_res_8x16b_r2_1 = _mm_adds_epi16(i4_res_8x16b_r2_1, i4_res_8x16b_r2_2);

        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_3);
        i4_res_8x16b_r2_1 = _mm_adds_epi16(i4_res_8x16b_r2_1, i4_res_8x16b_r2_3);

        /* Storing the results */
        _mm_storeu_si128((__m128i *) pi2_tmp, i4_res_8x16b_r1_1);
        _mm_storeu_si128((__m128i *) (pi2_tmp + i4_filt_stride), i4_res_8x16b_r2_1);
        pi2_tmp += (i4_filt_stride << 1);
        pu1_inp += i4_src_stride;

        i4_samp_8x16b_0 = i4_samp_8x16b_1;
        i4_samp_8x16b_1 = i4_samp_8x16b_2;
        i4_samp_8x16b_2 = i4_samp_8x16b_3;
        i4_samp_8x16b_3 = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *) (pu1_inp)));
        /* y_phase is 4 for odd values of y */
        /* and 12 for even values of y */
        /* Multiply by 8 => left shift by 3 */
        i4_res_8x16b_r1_1 = _mm_mullo_epi16(i4_samp_8x16b_0, i4_coeff_8x16b_0);
        i4_res_8x16b_r1_2 = _mm_mullo_epi16(i4_samp_8x16b_1, i4_coeff_8x16b_1);
        i4_res_8x16b_r1_3 = _mm_slli_epi16(i4_samp_8x16b_2, 3);

        i4_res_8x16b_r2_1 = _mm_slli_epi16(i4_samp_8x16b_1, 3);
        i4_res_8x16b_r2_2 = _mm_mullo_epi16(i4_samp_8x16b_2, i4_coeff_8x16b_1);
        i4_res_8x16b_r2_3 = _mm_mullo_epi16(i4_samp_8x16b_3, i4_coeff_8x16b_0);

        i4_res_8x16b_r1_3 = _mm_subs_epi16(i4_res_8x16b_r1_3, i4_samp_8x16b_3);
        i4_res_8x16b_r2_3 = _mm_subs_epi16(i4_res_8x16b_r2_3, i4_samp_8x16b_0);

        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_2);
        i4_res_8x16b_r2_1 = _mm_adds_epi16(i4_res_8x16b_r2_1, i4_res_8x16b_r2_2);

        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_3);
        i4_res_8x16b_r2_1 = _mm_adds_epi16(i4_res_8x16b_r2_1, i4_res_8x16b_r2_3);

        /* Storing the results */
        _mm_storeu_si128((__m128i *) pi2_tmp, i4_res_8x16b_r1_1);
        _mm_storeu_si128((__m128i *) (pi2_tmp + i4_filt_stride), i4_res_8x16b_r2_1);
        pi2_tmp += (i4_filt_stride << 1);
        pu1_inp += i4_src_stride;

        i4_samp_8x16b_0 = i4_samp_8x16b_1;
        i4_samp_8x16b_1 = i4_samp_8x16b_2;
        i4_samp_8x16b_2 = i4_samp_8x16b_3;
        i4_samp_8x16b_3 = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *) (pu1_inp)));
        /* y_phase is 4 for odd values of y */
        /* and 12 for even values of y */
        /* Multiply by 8 => left shift by 3 */
        i4_res_8x16b_r1_1 = _mm_mullo_epi16(i4_samp_8x16b_0, i4_coeff_8x16b_0);
        i4_res_8x16b_r1_2 = _mm_mullo_epi16(i4_samp_8x16b_1, i4_coeff_8x16b_1);
        i4_res_8x16b_r1_3 = _mm_slli_epi16(i4_samp_8x16b_2, 3);

        i4_res_8x16b_r2_1 = _mm_slli_epi16(i4_samp_8x16b_1, 3);
        i4_res_8x16b_r2_2 = _mm_mullo_epi16(i4_samp_8x16b_2, i4_coeff_8x16b_1);
        i4_res_8x16b_r2_3 = _mm_mullo_epi16(i4_samp_8x16b_3, i4_coeff_8x16b_0);

        i4_res_8x16b_r1_3 = _mm_subs_epi16(i4_res_8x16b_r1_3, i4_samp_8x16b_3);
        i4_res_8x16b_r2_3 = _mm_subs_epi16(i4_res_8x16b_r2_3, i4_samp_8x16b_0);

        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_2);
        i4_res_8x16b_r2_1 = _mm_adds_epi16(i4_res_8x16b_r2_1, i4_res_8x16b_r2_2);

        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_3);
        i4_res_8x16b_r2_1 = _mm_adds_epi16(i4_res_8x16b_r2_1, i4_res_8x16b_r2_3);

        /* Storing the results */
        _mm_storeu_si128((__m128i *) pi2_tmp, i4_res_8x16b_r1_1);
        _mm_storeu_si128((__m128i *) (pi2_tmp + i4_filt_stride), i4_res_8x16b_r2_1);
        pi2_tmp += (i4_filt_stride << 1);
        pu1_inp += i4_src_stride;

        i4_samp_8x16b_0 = i4_samp_8x16b_1;
        i4_samp_8x16b_1 = i4_samp_8x16b_2;
        i4_samp_8x16b_2 = i4_samp_8x16b_3;
        i4_samp_8x16b_3 = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *) (pu1_inp)));
        /* y_phase is 4 for odd values of y */
        /* and 12 for even values of y */
        /* Multiply by 8 => left shift by 3 */
        i4_res_8x16b_r1_1 = _mm_mullo_epi16(i4_samp_8x16b_0, i4_coeff_8x16b_0);
        i4_res_8x16b_r1_2 = _mm_mullo_epi16(i4_samp_8x16b_1, i4_coeff_8x16b_1);
        i4_res_8x16b_r1_3 = _mm_slli_epi16(i4_samp_8x16b_2, 3);

        i4_res_8x16b_r2_1 = _mm_slli_epi16(i4_samp_8x16b_1, 3);
        i4_res_8x16b_r2_2 = _mm_mullo_epi16(i4_samp_8x16b_2, i4_coeff_8x16b_1);
        i4_res_8x16b_r2_3 = _mm_mullo_epi16(i4_samp_8x16b_3, i4_coeff_8x16b_0);

        i4_res_8x16b_r1_3 = _mm_subs_epi16(i4_res_8x16b_r1_3, i4_samp_8x16b_3);
        i4_res_8x16b_r2_3 = _mm_subs_epi16(i4_res_8x16b_r2_3, i4_samp_8x16b_0);

        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_2);
        i4_res_8x16b_r2_1 = _mm_adds_epi16(i4_res_8x16b_r2_1, i4_res_8x16b_r2_2);

        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_3);
        i4_res_8x16b_r2_1 = _mm_adds_epi16(i4_res_8x16b_r2_1, i4_res_8x16b_r2_3);

        /* Storing the results */
        _mm_storeu_si128((__m128i *) pi2_tmp, i4_res_8x16b_r1_1);
        _mm_storeu_si128((__m128i *) (pi2_tmp + i4_filt_stride), i4_res_8x16b_r2_1);
        pi2_tmp += (i4_filt_stride << 1);
        pu1_inp += i4_src_stride;

        /* y = 15, y_phase = 4 */
        i4_samp_8x16b_0 = i4_samp_8x16b_1;
        i4_samp_8x16b_1 = i4_samp_8x16b_2;
        i4_samp_8x16b_2 = i4_samp_8x16b_3;
        i4_samp_8x16b_3 = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *) (pu1_inp)));

        i4_res_8x16b_r1_1 = _mm_mullo_epi16(i4_samp_8x16b_0, i4_coeff_8x16b_0);
        i4_res_8x16b_r1_2 = _mm_mullo_epi16(i4_samp_8x16b_1, i4_coeff_8x16b_1);
        i4_res_8x16b_r1_3 = _mm_slli_epi16(i4_samp_8x16b_2, 3);
        i4_res_8x16b_r1_3 = _mm_subs_epi16(i4_res_8x16b_r1_3, i4_samp_8x16b_3);

        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_2);
        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_3);

        /* Store the output */
        _mm_storeu_si128((__m128i *) pi2_tmp, i4_res_8x16b_r1_1);

        /* Reinitializing the ptrs */
        pu1_inp = pu1_inp_buf;
        pi2_tmp = pi2_tmp_filt_buf;
    } /* End of loop over x */
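
    /* The block above is fully unrolled: it fills all 16 rows of the
     * stride-12 intermediate buffer for the first eight columns. Rows are
     * produced in pairs (phase 4, then phase 12) because consecutive output
     * rows share the same sliding four-row window of input samples. */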

    /* Remaining 4 columns */
    pu1_inp += 8;
    pi2_tmp += 8;
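    /* The reference rows are 12 samples wide, one 8-lane pass short; columns
     * 8..11 are filtered below with the same taps in a rolled loop, and
     * _mm_storel_epi64 writes just the four remaining 16-bit results per
     * intermediate row. */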
    for(i4_x = 0; i4_x < 1; i4_x++)
    {
        /* y = 0, y_phase = 12 */
        i4_samp_16x8b_0 = _mm_loadl_epi64((__m128i *) (pu1_inp));
        i4_samp_16x8b_1 = _mm_loadl_epi64((__m128i *) (pu1_inp + i4_src_stride));
        i4_samp_16x8b_2 = _mm_loadl_epi64((__m128i *) (pu1_inp + (i4_src_stride << 1)));
        i4_samp_16x8b_3 =
            _mm_loadl_epi64((__m128i *) (pu1_inp + (i4_src_stride << 1) + i4_src_stride));
        pu1_inp += (i4_src_stride << 2);
        i4_samp_8x16b_0 = _mm_cvtepu8_epi16(i4_samp_16x8b_0);
        i4_samp_8x16b_1 = _mm_cvtepu8_epi16(i4_samp_16x8b_1);
        i4_samp_8x16b_2 = _mm_cvtepu8_epi16(i4_samp_16x8b_2);
        i4_samp_8x16b_3 = _mm_cvtepu8_epi16(i4_samp_16x8b_3);

        /* since y_phase is 12 for y = 0 */
        /* Multiply by 8 => left shift by 3 */
        i4_res_8x16b_r1_1 = _mm_slli_epi16(i4_samp_8x16b_1, 3);
        i4_res_8x16b_r1_2 = _mm_mullo_epi16(i4_samp_8x16b_2, i4_coeff_8x16b_1);
        i4_res_8x16b_r1_3 = _mm_mullo_epi16(i4_samp_8x16b_3, i4_coeff_8x16b_0);

        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_2);
        i4_res_8x16b_r1_3 = _mm_subs_epi16(i4_res_8x16b_r1_3, i4_samp_8x16b_0);
        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_3);

        _mm_storel_epi64((__m128i *) pi2_tmp, i4_res_8x16b_r1_1);
        pi2_tmp += i4_filt_stride;

        for(i4_y = 1; i4_y < 15; i4_y += 2)
        {
            i4_samp_8x16b_0 = i4_samp_8x16b_1;
            i4_samp_8x16b_1 = i4_samp_8x16b_2;
            i4_samp_8x16b_2 = i4_samp_8x16b_3;
            i4_samp_8x16b_3 = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *) (pu1_inp)));
            /* y_phase is 4 for odd values of y */
            /* and 12 for even values of y */
            /* Multiply by 8 => left shift by 3 */
            i4_res_8x16b_r1_1 = _mm_mullo_epi16(i4_samp_8x16b_0, i4_coeff_8x16b_0);
            i4_res_8x16b_r1_2 = _mm_mullo_epi16(i4_samp_8x16b_1, i4_coeff_8x16b_1);
            i4_res_8x16b_r1_3 = _mm_slli_epi16(i4_samp_8x16b_2, 3);

            i4_res_8x16b_r2_1 = _mm_slli_epi16(i4_samp_8x16b_1, 3);
            i4_res_8x16b_r2_2 = _mm_mullo_epi16(i4_samp_8x16b_2, i4_coeff_8x16b_1);
            i4_res_8x16b_r2_3 = _mm_mullo_epi16(i4_samp_8x16b_3, i4_coeff_8x16b_0);

            i4_res_8x16b_r1_3 = _mm_subs_epi16(i4_res_8x16b_r1_3, i4_samp_8x16b_3);
            i4_res_8x16b_r2_3 = _mm_subs_epi16(i4_res_8x16b_r2_3, i4_samp_8x16b_0);

            i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_2);
            i4_res_8x16b_r2_1 = _mm_adds_epi16(i4_res_8x16b_r2_1, i4_res_8x16b_r2_2);

            i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_3);
            i4_res_8x16b_r2_1 = _mm_adds_epi16(i4_res_8x16b_r2_1, i4_res_8x16b_r2_3);

            /* Storing the results */
            _mm_storel_epi64((__m128i *) pi2_tmp, i4_res_8x16b_r1_1);
            _mm_storel_epi64((__m128i *) (pi2_tmp + i4_filt_stride), i4_res_8x16b_r2_1);
            pi2_tmp += (i4_filt_stride << 1);
            pu1_inp += i4_src_stride;
        } /* End of loop over y */

        /* y = 15, y_phase = 4 */
        i4_samp_8x16b_0 = i4_samp_8x16b_1;
        i4_samp_8x16b_1 = i4_samp_8x16b_2;
        i4_samp_8x16b_2 = i4_samp_8x16b_3;
        i4_samp_8x16b_3 = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *) (pu1_inp)));

        i4_res_8x16b_r1_1 = _mm_mullo_epi16(i4_samp_8x16b_0, i4_coeff_8x16b_0);
        i4_res_8x16b_r1_2 = _mm_mullo_epi16(i4_samp_8x16b_1, i4_coeff_8x16b_1);
        i4_res_8x16b_r1_3 = _mm_slli_epi16(i4_samp_8x16b_2, 3);
        i4_res_8x16b_r1_3 = _mm_subs_epi16(i4_res_8x16b_r1_3, i4_samp_8x16b_3);

        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_2);
        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_3);

        /* Store the output */
        _mm_storel_epi64((__m128i *) pi2_tmp, i4_res_8x16b_r1_1);

        /* Reinitializing the ptrs */
        pu1_inp = pu1_inp_buf;
        pi2_tmp = pi2_tmp_filt_buf;
    }

    {
        __m128i coeff_c0_c1_8x16b = _mm_set_epi16(28, -3, 28, -3, 28, -3, 28, -3);
        __m128i coeff_c2_c3_8x16b = _mm_set_epi16(-1, 8, -1, 8, -1, 8, -1, 8);
        __m128i coeff_c3_c2_8x16b = _mm_set_epi16(8, -1, 8, -1, 8, -1, 8, -1);
        __m128i coeff_c1_c0_8x16b = _mm_set_epi16(-3, 28, -3, 28, -3, 28, -3, 28);

        __m128i i4_samp_8x16b_rpart1_0, i4_samp_8x16b_rpart2_0;
        __m128i i4_samp_8x16b_rpart1_1, i4_samp_8x16b_rpart2_1;
        __m128i i4_samp_8x16b_rpart1_2, i4_samp_8x16b_rpart2_2;
        __m128i i4_samp_8x16b_rpart1_3, i4_samp_8x16b_rpart2_3;
        __m128i i4_samp_8x16b_rpart1_4, i4_samp_8x16b_rpart2_4;

        __m128i i4_res_4x32b_rpart1_0, i4_res_4x32b_rpart2_0;
        __m128i i4_res_4x32b_rpart1_1, i4_res_4x32b_rpart2_1;
        __m128i i4_res_4x32b_rpart1_2, i4_res_4x32b_rpart2_2;
        __m128i i4_res_4x32b_rpart1_3, i4_res_4x32b_rpart2_3;

        __m128i res_512 = _mm_set1_epi32(512);
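
        /* Rounding note: each 4-tap pass has a DC gain of 32
         * (-1 + 8 + 28 - 3 = 32), so the cascaded vertical + horizontal gain
         * is 1024; adding 512 before the arithmetic shift right by 10 rounds
         * the final sample to nearest. */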
        /* Horizontal interpolation */
        for(i4_y = 0; i4_y < 16; i4_y++)
        {
            // a0 a1 a2 a3 a4 a5 a6 a7
            i4_samp_8x16b_rpart1_0 = _mm_loadu_si128((__m128i *) pi2_tmp);
            // a4 a5 a6 a7 a8 a9 a10 a11
            i4_samp_8x16b_rpart2_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 4));
            // a1 a2 a3 a4 a5 a6 a7 0
            i4_samp_8x16b_rpart1_1 = _mm_srli_si128(i4_samp_8x16b_rpart1_0, 2);
            // a2 a3 a4 a5 a6 a7 0 0
            i4_samp_8x16b_rpart1_2 = _mm_srli_si128(i4_samp_8x16b_rpart1_0, 4);
            // a3 a4 a5 a6 a7 0 0 0
            i4_samp_8x16b_rpart1_3 = _mm_srli_si128(i4_samp_8x16b_rpart1_0, 6);
            // a4 a5 a6 a7 0 0 0 0
            i4_samp_8x16b_rpart1_4 = _mm_srli_si128(i4_samp_8x16b_rpart1_0, 8);

            // a5 a6 a7 a8 a9 a10 a11 0
            i4_samp_8x16b_rpart2_1 = _mm_srli_si128(i4_samp_8x16b_rpart2_0, 2);
            // a6 a7 a8 a9 a10 a11 0 0
            i4_samp_8x16b_rpart2_2 = _mm_srli_si128(i4_samp_8x16b_rpart2_0, 4);
            // a7 a8 a9 a10 a11 0 0 0
            i4_samp_8x16b_rpart2_3 = _mm_srli_si128(i4_samp_8x16b_rpart2_0, 6);
            // a8 a9 a10 a11 0 0 0 0
            i4_samp_8x16b_rpart2_4 = _mm_srli_si128(i4_samp_8x16b_rpart2_0, 8);
            // a0 a1  a1 a2  a2 a3  a3 a4
            i4_samp_8x16b_rpart1_0 =
                _mm_unpacklo_epi16(i4_samp_8x16b_rpart1_0, i4_samp_8x16b_rpart1_1);
            // a1 a2  a2 a3  a3 a4  a4 a5
            i4_samp_8x16b_rpart1_1 =
                _mm_unpacklo_epi16(i4_samp_8x16b_rpart1_1, i4_samp_8x16b_rpart1_2);
            // a2 a3  a3 a4  a4 a5  a5 a6
            i4_samp_8x16b_rpart1_2 =
                _mm_unpacklo_epi16(i4_samp_8x16b_rpart1_2, i4_samp_8x16b_rpart1_3);
            // a3 a4  a4 a5  a5 a6  a6 a7
            i4_samp_8x16b_rpart1_3 =
                _mm_unpacklo_epi16(i4_samp_8x16b_rpart1_3, i4_samp_8x16b_rpart1_4);
            // a4 a5  a5 a6  a6 a7  a7 a8
            i4_samp_8x16b_rpart2_0 =
                _mm_unpacklo_epi16(i4_samp_8x16b_rpart2_0, i4_samp_8x16b_rpart2_1);
            // a5 a6  a6 a7  a7 a8  a8 a9
            i4_samp_8x16b_rpart2_1 =
                _mm_unpacklo_epi16(i4_samp_8x16b_rpart2_1, i4_samp_8x16b_rpart2_2);
            // a6 a7  a7 a8  a8 a9  a9 a10
            i4_samp_8x16b_rpart2_2 =
                _mm_unpacklo_epi16(i4_samp_8x16b_rpart2_2, i4_samp_8x16b_rpart2_3);
            // a7 a8  a8 a9  a9 a10  a10 a11
            i4_samp_8x16b_rpart2_3 =
                _mm_unpacklo_epi16(i4_samp_8x16b_rpart2_3, i4_samp_8x16b_rpart2_4);
            // a0c3+a1c2  a1c3+a2c2  a2c3+a3c2  a3c3+a4c2
            i4_res_4x32b_rpart1_0 = _mm_madd_epi16(i4_samp_8x16b_rpart1_0, coeff_c3_c2_8x16b);
            // a2c1+a3c0  a3c1+a4c0  a4c1+a5c0  a5c1+a6c0
            i4_res_4x32b_rpart1_2 = _mm_madd_epi16(i4_samp_8x16b_rpart1_2, coeff_c1_c0_8x16b);
            // a1c0+a2c1  a2c0+a3c1  a3c0+a4c1  a4c0+a5c1
            i4_res_4x32b_rpart1_1 = _mm_madd_epi16(i4_samp_8x16b_rpart1_1, coeff_c0_c1_8x16b);
            // a3c2+a4c3  a4c2+a5c3  a5c2+a6c3  a6c2+a7c3
            i4_res_4x32b_rpart1_3 = _mm_madd_epi16(i4_samp_8x16b_rpart1_3, coeff_c2_c3_8x16b);
            // a4c3+a5c2  a5c3+a6c2  a6c3+a7c2  a7c3+a8c2
            i4_res_4x32b_rpart2_0 = _mm_madd_epi16(i4_samp_8x16b_rpart2_0, coeff_c3_c2_8x16b);
            // a6c1+a7c0  a7c1+a8c0  a8c1+a9c0  a9c1+a10c0
            i4_res_4x32b_rpart2_2 = _mm_madd_epi16(i4_samp_8x16b_rpart2_2, coeff_c1_c0_8x16b);
            // a5c0+a6c1  a6c0+a7c1  a7c0+a8c1  a8c0+a9c1
            i4_res_4x32b_rpart2_1 = _mm_madd_epi16(i4_samp_8x16b_rpart2_1, coeff_c0_c1_8x16b);
            // a7c2+a8c3  a8c2+a9c3  a9c2+a10c3  a10c2+a11c3
            i4_res_4x32b_rpart2_3 = _mm_madd_epi16(i4_samp_8x16b_rpart2_3, coeff_c2_c3_8x16b);
            // a0c3+a1c2 + a2c1+a3c0  a1c3+a2c2 + a3c1+a4c0  a2c3+a3c2 + a4c1+a5c0
            // a3c3+a4c2 + a5c1+a6c0
            i4_res_4x32b_rpart1_0 = _mm_add_epi32(i4_res_4x32b_rpart1_0, i4_res_4x32b_rpart1_2);
            // a1c0+a2c1 + a3c2+a4c3  a2c0+a3c1 + a4c2+a5c3  a3c0+a4c1 + a5c2+a6c3
            // a4c0+a5c1 + a6c2+a7c3
            i4_res_4x32b_rpart1_1 = _mm_add_epi32(i4_res_4x32b_rpart1_1, i4_res_4x32b_rpart1_3);
            // a4c3+a5c2 + a6c1+a7c0  a5c3+a6c2 + a7c1+a8c0  a6c3+a7c2 + a8c1+a9c0
            // a7c3+a8c2 + a9c1+a10c0
            i4_res_4x32b_rpart2_0 = _mm_add_epi32(i4_res_4x32b_rpart2_0, i4_res_4x32b_rpart2_2);
            // a5c0+a6c1 + a7c2+a8c3  a6c0+a7c1 + a8c2+a9c3  a7c0+a8c1 + a9c2+a10c3
            // a8c0+a9c1 + a10c2+a11c3
            i4_res_4x32b_rpart2_1 = _mm_add_epi32(i4_res_4x32b_rpart2_1, i4_res_4x32b_rpart2_3);

            i4_res_4x32b_rpart1_2 =
                _mm_unpacklo_epi32(i4_res_4x32b_rpart1_0, i4_res_4x32b_rpart1_1);
            i4_res_4x32b_rpart1_3 =
                _mm_unpackhi_epi32(i4_res_4x32b_rpart1_0, i4_res_4x32b_rpart1_1);

            i4_res_4x32b_rpart2_2 =
                _mm_unpacklo_epi32(i4_res_4x32b_rpart2_0, i4_res_4x32b_rpart2_1);
            i4_res_4x32b_rpart2_3 =
                _mm_unpackhi_epi32(i4_res_4x32b_rpart2_0, i4_res_4x32b_rpart2_1);

            i4_res_4x32b_rpart1_0 = _mm_add_epi32(i4_res_4x32b_rpart1_2, res_512);
            i4_res_4x32b_rpart1_1 = _mm_add_epi32(i4_res_4x32b_rpart1_3, res_512);

            i4_res_4x32b_rpart1_0 = _mm_srai_epi32(i4_res_4x32b_rpart1_0, 10);
            i4_res_4x32b_rpart1_1 = _mm_srai_epi32(i4_res_4x32b_rpart1_1, 10);

            i4_res_4x32b_rpart2_0 = _mm_add_epi32(i4_res_4x32b_rpart2_2, res_512);
            i4_res_4x32b_rpart2_1 = _mm_add_epi32(i4_res_4x32b_rpart2_3, res_512);

            i4_res_4x32b_rpart2_0 = _mm_srai_epi32(i4_res_4x32b_rpart2_0, 10);
            i4_res_4x32b_rpart2_1 = _mm_srai_epi32(i4_res_4x32b_rpart2_1, 10);

            _mm_storeu_si128(
                (__m128i *) pu1_out,
                _mm_packus_epi16(_mm_packus_epi32(i4_res_4x32b_rpart1_0, i4_res_4x32b_rpart1_1),
                                 _mm_packus_epi32(i4_res_4x32b_rpart2_0, i4_res_4x32b_rpart2_1)));

            pi2_tmp += i4_filt_stride;
            pu1_out += i4_out_stride;
        } /* End of loop over y */
    }
} /* isvcd_interpolate_base_luma_dyadic_sse42 */

/*****************************************************************************/
/*                                                                           */
/*  Function Name : isvcd_interpolate_intra_base_sse42                       */
/*                                                                           */
/*  Description   : This function takes the reference array buffer & performs*/
/*                  interpolation of a component to find the intra           */
/*                  resampled value                                          */
/*  Inputs        : pv_intra_samp_ctxt : intra sampling context              */
/*                  pu1_out : output buffer pointer                          */
/*                  i4_out_stride : output buffer stride                     */
/*                  i4_refarray_wd : reference array width                   */
/*                  i4_x_offset : offset in reference layer in horz direction*/
/*                  ps_coord : current mb co-ordinate                        */
/*                  i4_chroma_flag : chroma processing flag                  */
/*  Globals       : none                                                     */
/*  Processing    : it does the interpolation in vertical direction followed */
/*                  by horizontal direction                                  */
/*  Outputs       : resampled pixels                                         */
/*  Returns       : none                                                     */
/*                                                                           */
/*  Issues        : none                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         06 09 2021   Kishore         creation                             */
/*                                                                           */
/*****************************************************************************/
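/* Unlike the dyadic routine above, this function handles arbitrary scaling
 * ratios: for every output row and column, the phase (0..15) looked up from
 * ps_y_pos_phase/ps_x_pos_phase selects a 4-tap luma filter from
 * g_ai1_interp_filter_luma, which is laid out tap-major (tap t of phase p at
 * index 16 * t + p, hence the +16/+32/+48 offsets in the body below). */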
void isvcd_interpolate_intra_base_sse42(void *pv_intra_samp_ctxt, UWORD8 *pu1_out,
                                        WORD32 i4_out_stride, WORD32 i4_refarray_wd, WORD32 i4_mb_x,
                                        WORD32 i4_mb_y, WORD32 i4_chroma_flag,
                                        WORD32 i4_refarray_flag)
{
    /* --------------------------------------------------------------------- */
    /* Index Parameters                                                       */
    /* --------------------------------------------------------------------- */
    intra_sampling_ctxt_t *ps_ctxt;
    intra_samp_map_ctxt_t *ps_map_ctxt;
    intra_samp_lyr_ctxt *ps_lyr_ctxt;
    WORD32 i4_x, i4_y;
    WORD32 i4_frm_mb_x, i4_frm_mb_y;
    UWORD8 *pu1_refarray = NULL;
    ref_pixel_map_t *ps_x_pos_phase;
    ref_pixel_map_t *ps_y_pos_phase;
    WORD32 i4_temp_array_ht;
    WORD32 *pi4_interp_buff;
    WORD32 i4_mb_wd;
    WORD32 i4_mb_ht;

    WORD32 i4_x_min;
    ref_min_max_map_t *ps_x_min_max;
    WORD8 arr_y_ref_pos_luma[16] = {0};
    WORD8 arr_x_ref_pos_luma[16] = {0};
    WORD8 arr_x_ref_pos_luma_low[16] = {0};
    WORD8 arr_x_ref_pos_luma_high[16] = {0};
    WORD8 arr_phase_luma[32] = {0};
    WORD8 *pi4_y_ref_pos_luma;
    WORD8 *pi4_x_ref_pos_luma_low;
    WORD8 *pi4_x_ref_pos_luma_high;
    WORD8 *pi4_phase_luma;
    UWORD8 *pu1_refarray_temp;

    /* --------------------------------------------------------------------- */
    /* Extracting pointers from the context                                   */
    /* --------------------------------------------------------------------- */
    ps_ctxt = (intra_sampling_ctxt_t *) pv_intra_samp_ctxt;
    ps_lyr_ctxt = &ps_ctxt->as_res_lyrs[ps_ctxt->i4_res_lyr_id];

    if(0 == i4_refarray_flag)
    {
        pu1_refarray = ps_ctxt->pu1_refarray_buffer;
    }
    else if(1 == i4_refarray_flag)
    {
        pu1_refarray = ps_ctxt->pu1_refarray_cb;
    }

    /* --------------------------------------------------------------------- */
    /* LUMA or CHROMA                                                         */
    /* --------------------------------------------------------------------- */

    if(1 == i4_chroma_flag)
        ps_map_ctxt = &(ps_lyr_ctxt->s_chroma_map_ctxt);
    else
        ps_map_ctxt = &(ps_lyr_ctxt->s_luma_map_ctxt);

    i4_mb_wd = MB_WIDTH >> i4_chroma_flag;
    i4_mb_ht = MB_HEIGHT >> i4_chroma_flag;

    ps_x_min_max = ps_map_ctxt->ps_x_min_max;

    i4_frm_mb_y = i4_mb_y * i4_mb_ht;
    i4_frm_mb_x = i4_mb_x * i4_mb_wd;
    /* get the min position */
    i4_x_min = ps_x_min_max[i4_mb_x].i2_min_pos;

    /* --------------------------------------------------------------------- */
    /* Projected frame level pointers                                         */
    /* --------------------------------------------------------------------- */
    ps_x_pos_phase = ps_map_ctxt->ps_x_pos_phase;
    ps_y_pos_phase = ps_map_ctxt->ps_y_pos_phase;

    /* --------------------------------------------------------------------- */
    /* Pointers and dimensions of the temporary buffer                        */
    /* --------------------------------------------------------------------- */
    i4_temp_array_ht = i4_mb_ht;
    pi4_interp_buff = ps_ctxt->pi4_temp_interpolation_buffer;

    if(i4_chroma_flag == 0)
    {
        /* --------------------------------------------------------------------- */
        /* Loop for interpolation in vertical direction                           */
        /* --------------------------------------------------------------------- */
        WORD16 *pi2_interp_buff_temp;
        pi2_interp_buff_temp = (WORD16 *) pi4_interp_buff;
        {
            __m128i out_res_8x16b_0, out_res_8x16b_1;

            __m128i inp_8x16b_r0, inp_8x16b_r01_0, phs_mask_16x8b_r0, phs_mask_16x8b_r01_0,
                out_res_8x16b_r01_0;
            __m128i inp_8x16b_r1, inp_8x16b_r23_0, phs_mask_16x8b_r1, phs_mask_16x8b_r23_0,
                out_res_8x16b_r01_1;
            __m128i inp_8x16b_r2, inp_8x16b_r01_1, phs_mask_16x8b_r2, phs_mask_16x8b_r01_1,
                out_res_8x16b_r23_0;
            __m128i inp_8x16b_r3, inp_8x16b_r23_1, phs_mask_16x8b_r3, phs_mask_16x8b_r23_1,
                out_res_8x16b_r23_1;

            for(i4_y = 0; i4_y < (i4_temp_array_ht); i4_y++)
            {
                arr_phase_luma[i4_y] = (WORD8) ps_y_pos_phase[i4_y + i4_frm_mb_y].i2_phase;
                arr_y_ref_pos_luma[i4_y] = (WORD8) (ps_y_pos_phase[i4_y + i4_frm_mb_y].i2_ref_pos);
            }
            pi4_y_ref_pos_luma = arr_y_ref_pos_luma;
            pi4_phase_luma = arr_phase_luma;

            for(i4_y = 0; i4_y < (i4_temp_array_ht); i4_y++)
            {
                pu1_refarray_temp =
                    pu1_refarray + (pi4_y_ref_pos_luma[i4_y] * i4_refarray_wd) + (i4_x_min - 1);
                inp_8x16b_r0 = _mm_loadu_si128((__m128i *) (pu1_refarray_temp - i4_refarray_wd));
                inp_8x16b_r1 = _mm_loadu_si128((__m128i *) (pu1_refarray_temp));
                inp_8x16b_r2 = _mm_loadu_si128((__m128i *) (pu1_refarray_temp + i4_refarray_wd));
                inp_8x16b_r3 =
                    _mm_loadu_si128((__m128i *) (pu1_refarray_temp + 2 * i4_refarray_wd));

                inp_8x16b_r01_0 = _mm_unpacklo_epi8(inp_8x16b_r0, inp_8x16b_r1);
                inp_8x16b_r23_0 = _mm_unpacklo_epi8(inp_8x16b_r2, inp_8x16b_r3);
                inp_8x16b_r01_1 = _mm_unpackhi_epi8(inp_8x16b_r0, inp_8x16b_r1);
                inp_8x16b_r23_1 = _mm_unpackhi_epi8(inp_8x16b_r2, inp_8x16b_r3);

                phs_mask_16x8b_r0 = _mm_set1_epi8(g_ai1_interp_filter_luma[pi4_phase_luma[i4_y]]);
                phs_mask_16x8b_r1 =
                    _mm_set1_epi8(g_ai1_interp_filter_luma[pi4_phase_luma[i4_y] + 16]);
                phs_mask_16x8b_r2 =
                    _mm_set1_epi8(g_ai1_interp_filter_luma[pi4_phase_luma[i4_y] + 32]);
                phs_mask_16x8b_r3 =
                    _mm_set1_epi8(g_ai1_interp_filter_luma[pi4_phase_luma[i4_y] + 48]);

                phs_mask_16x8b_r01_0 = _mm_unpacklo_epi8(phs_mask_16x8b_r0, phs_mask_16x8b_r1);
                phs_mask_16x8b_r23_0 = _mm_unpacklo_epi8(phs_mask_16x8b_r2, phs_mask_16x8b_r3);
                phs_mask_16x8b_r01_1 = _mm_unpackhi_epi8(phs_mask_16x8b_r0, phs_mask_16x8b_r1);
                phs_mask_16x8b_r23_1 = _mm_unpackhi_epi8(phs_mask_16x8b_r2, phs_mask_16x8b_r3);

                out_res_8x16b_r01_0 = _mm_maddubs_epi16(inp_8x16b_r01_0, phs_mask_16x8b_r01_0);
                out_res_8x16b_r01_1 = _mm_maddubs_epi16(inp_8x16b_r01_1, phs_mask_16x8b_r01_1);
                out_res_8x16b_r23_0 = _mm_maddubs_epi16(inp_8x16b_r23_0, phs_mask_16x8b_r23_0);
                out_res_8x16b_r23_1 = _mm_maddubs_epi16(inp_8x16b_r23_1, phs_mask_16x8b_r23_1);

                out_res_8x16b_0 = _mm_add_epi16(out_res_8x16b_r01_0, out_res_8x16b_r23_0);
                out_res_8x16b_1 = _mm_add_epi16(out_res_8x16b_r01_1, out_res_8x16b_r23_1);

                _mm_storeu_si128(
                    (__m128i *) (pi2_interp_buff_temp + (i4_y * i4_refarray_wd) + (i4_x_min - 1)),
                    out_res_8x16b_0);
                _mm_storeu_si128((__m128i *) (pi2_interp_buff_temp + (i4_y * i4_refarray_wd) +
                                              (i4_x_min - 1) + 8),
                                 out_res_8x16b_1);
            }
        }
        /* --------------------------------------------------------------------- */
        /* Loop for interpolation in horizontal direction                         */
        /* --------------------------------------------------------------------- */
        {
            WORD32 strt_indx = 10, strt_indx_h = 0;

            __m128i inp_8x16b_0;
            __m128i inp_8x16b_1;

            __m128i phs_mask_16x8b_0;
            __m128i phs_mask_16x8b_1;
            __m128i x_ref_pos_luma_mask_r0_0, x_ref_pos_luma_mask_r0_1, x_ref_pos_luma_mask_r1_0,
                x_ref_pos_luma_mask_r1_1, x_ref_pos_luma_mask_r2_0, x_ref_pos_luma_mask_r2_1,
                x_ref_pos_luma_mask_r3_0, x_ref_pos_luma_mask_r3_1;

            __m128i inp_8x16b_2, inp_8x16b_3;

            WORD32 i4_x2 = 0;
            WORD32 i4_mb_wd_hlf = (i4_mb_wd >> 1);
            __m128i twos = _mm_set1_epi8(2);

            strt_indx = ps_x_pos_phase[0 + i4_frm_mb_x].i2_ref_pos - 1;
            strt_indx_h = (ps_x_pos_phase[8 + i4_frm_mb_x].i2_ref_pos - strt_indx - 1);
            for(i4_x = 0; i4_x < i4_mb_wd; i4_x++)
            {
                arr_x_ref_pos_luma[i4_x] = (WORD8) ps_x_pos_phase[i4_x + i4_frm_mb_x].i2_ref_pos;
                arr_phase_luma[i4_x] = (WORD8) ps_x_pos_phase[i4_x + i4_frm_mb_x].i2_phase;
                arr_x_ref_pos_luma[i4_x] = arr_x_ref_pos_luma[i4_x] - strt_indx - 1;
            }

            for(i4_x = 0; i4_x < i4_mb_wd_hlf; i4_x++)
            {
                i4_x2 = i4_x << 1;
                arr_x_ref_pos_luma_low[i4_x2] = (arr_x_ref_pos_luma[i4_x]) << 1;
                arr_x_ref_pos_luma_low[i4_x2 + 1] = arr_x_ref_pos_luma_low[i4_x2] + 1;
            }
            for(i4_x = i4_mb_wd_hlf; i4_x < i4_mb_wd; i4_x++)
            {
                i4_x2 = (i4_x - i4_mb_wd_hlf) << 1;
                arr_x_ref_pos_luma_high[i4_x2] = ((arr_x_ref_pos_luma[i4_x] - strt_indx_h) << 1);
                arr_x_ref_pos_luma_high[i4_x2 + 1] = arr_x_ref_pos_luma_high[i4_x2] + 1;
            }
            pi4_x_ref_pos_luma_low = arr_x_ref_pos_luma_low;
            pi4_x_ref_pos_luma_high = arr_x_ref_pos_luma_high;
            pi4_phase_luma = arr_phase_luma;

            phs_mask_16x8b_0 = _mm_loadu_si128((__m128i *) (pi4_phase_luma));
            phs_mask_16x8b_1 = _mm_loadu_si128((__m128i *) (pi4_phase_luma + 8));

            x_ref_pos_luma_mask_r0_0 = _mm_loadu_si128((__m128i *) (pi4_x_ref_pos_luma_low));
            x_ref_pos_luma_mask_r0_1 = _mm_loadu_si128((__m128i *) (pi4_x_ref_pos_luma_high));
            x_ref_pos_luma_mask_r1_0 = _mm_add_epi8(x_ref_pos_luma_mask_r0_0, twos);
            x_ref_pos_luma_mask_r1_1 = _mm_add_epi8(x_ref_pos_luma_mask_r0_1, twos);
            x_ref_pos_luma_mask_r2_0 = _mm_add_epi8(x_ref_pos_luma_mask_r1_0, twos);
            x_ref_pos_luma_mask_r2_1 = _mm_add_epi8(x_ref_pos_luma_mask_r1_1, twos);
            x_ref_pos_luma_mask_r3_0 = x_ref_pos_luma_mask_r0_0;
            x_ref_pos_luma_mask_r3_1 = x_ref_pos_luma_mask_r0_1;
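
            /* The shuffle indices above are byte indices into 16-bit data:
             * reference position x maps to bytes (2x, 2x + 1), and `twos`
             * advances a mask by one 16-bit sample (taps 0..2). The tap-3
             * masks reuse the tap-0 indices because their source vectors are
             * loaded at a +3 element offset (strt_indx + 3) in the loop
             * below. */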
820 
821             {
822                 __m128i ip_filt_16x8b_r0, ip_filt_8x16b_r0_0, ip_filt_8x16b_r0_1,
823                     ip_filt_8x16b_r01_l_0, ip_filt_8x16b_r01_h_0;
824                 __m128i ip_filt_16x8b_r1, ip_filt_8x16b_r1_0, ip_filt_8x16b_r1_1,
825                     ip_filt_8x16b_r23_l_0, ip_filt_8x16b_r23_h_0;
826                 __m128i ip_filt_16x8b_r2, ip_filt_8x16b_r2_0, ip_filt_8x16b_r2_1,
827                     ip_filt_8x16b_r01_l_1, ip_filt_8x16b_r01_h_1;
828                 __m128i ip_filt_16x8b_r3, ip_filt_8x16b_r3_0, ip_filt_8x16b_r3_1,
829                     ip_filt_8x16b_r23_l_1, ip_filt_8x16b_r23_h_1;
830 
831                 __m128i inp_8x16b_r0_0, inp_8x16b_r2_0, inp_8x16b_r01_l_0, inp_8x16b_r01_h_0,
832                     out_res_4x32b_r01_l_0, out_res_4x32b_r01_h_0;
833                 __m128i inp_8x16b_r0_1, inp_8x16b_r2_1, inp_8x16b_r23_l_0, inp_8x16b_r23_h_0,
834                     out_res_4x32b_r01_l_1, out_res_4x32b_r01_h_1;
835                 __m128i inp_8x16b_r1_0, inp_8x16b_r3_0, inp_8x16b_r01_l_1, inp_8x16b_r01_h_1,
836                     out_res_4x32b_r23_l_0, out_res_4x32b_r23_h_0;
837                 __m128i inp_8x16b_r1_1, inp_8x16b_r3_1, inp_8x16b_r23_l_1, inp_8x16b_r23_h_1,
838                     out_res_4x32b_r23_l_1, out_res_4x32b_r23_h_1;
839 
840                 __m128i out_res_4x32b_l_0;
841                 __m128i out_res_4x32b_l_1;
842                 __m128i out_res_4x32b_h_0;
843                 __m128i out_res_4x32b_h_1;
844 
845                 __m128i out_res_8x16b_l;
846                 __m128i out_res_8x16b_h;
847 
848                 __m128i out_res_16x8b;
849                 __m128i const_512 = _mm_set1_epi32(512);
850 
851                 ip_filt_16x8b_r0 = _mm_loadu_si128((__m128i *) (g_ai1_interp_filter_luma));
852                 ip_filt_16x8b_r1 = _mm_loadu_si128((__m128i *) (g_ai1_interp_filter_luma + 16));
853                 ip_filt_16x8b_r2 = _mm_loadu_si128((__m128i *) (g_ai1_interp_filter_luma + 32));
854                 ip_filt_16x8b_r3 = _mm_loadu_si128((__m128i *) (g_ai1_interp_filter_luma + 48));
855 
856                 ip_filt_8x16b_r0_0 =
857                     _mm_cvtepi8_epi16(_mm_shuffle_epi8(ip_filt_16x8b_r0, phs_mask_16x8b_0));
858                 ip_filt_8x16b_r1_0 =
859                     _mm_cvtepi8_epi16(_mm_shuffle_epi8(ip_filt_16x8b_r1, phs_mask_16x8b_0));
860                 ip_filt_8x16b_r2_0 =
861                     _mm_cvtepi8_epi16(_mm_shuffle_epi8(ip_filt_16x8b_r2, phs_mask_16x8b_0));
862                 ip_filt_8x16b_r3_0 =
863                     _mm_cvtepi8_epi16(_mm_shuffle_epi8(ip_filt_16x8b_r3, phs_mask_16x8b_0));
864 
865                 ip_filt_8x16b_r0_1 =
866                     _mm_cvtepi8_epi16(_mm_shuffle_epi8(ip_filt_16x8b_r0, phs_mask_16x8b_1));
867                 ip_filt_8x16b_r1_1 =
868                     _mm_cvtepi8_epi16(_mm_shuffle_epi8(ip_filt_16x8b_r1, phs_mask_16x8b_1));
869                 ip_filt_8x16b_r2_1 =
870                     _mm_cvtepi8_epi16(_mm_shuffle_epi8(ip_filt_16x8b_r2, phs_mask_16x8b_1));
871                 ip_filt_8x16b_r3_1 =
872                     _mm_cvtepi8_epi16(_mm_shuffle_epi8(ip_filt_16x8b_r3, phs_mask_16x8b_1));
873 
874                 ip_filt_8x16b_r01_l_0 = _mm_unpacklo_epi16(ip_filt_8x16b_r0_0, ip_filt_8x16b_r1_0);
875                 ip_filt_8x16b_r23_l_0 = _mm_unpacklo_epi16(ip_filt_8x16b_r2_0, ip_filt_8x16b_r3_0);
876                 ip_filt_8x16b_r01_l_1 = _mm_unpackhi_epi16(ip_filt_8x16b_r0_0, ip_filt_8x16b_r1_0);
877                 ip_filt_8x16b_r23_l_1 = _mm_unpackhi_epi16(ip_filt_8x16b_r2_0, ip_filt_8x16b_r3_0);
878 
879                 ip_filt_8x16b_r01_h_0 = _mm_unpacklo_epi16(ip_filt_8x16b_r0_1, ip_filt_8x16b_r1_1);
880                 ip_filt_8x16b_r23_h_0 = _mm_unpacklo_epi16(ip_filt_8x16b_r2_1, ip_filt_8x16b_r3_1);
881                 ip_filt_8x16b_r01_h_1 = _mm_unpackhi_epi16(ip_filt_8x16b_r0_1, ip_filt_8x16b_r1_1);
882                 ip_filt_8x16b_r23_h_1 = _mm_unpackhi_epi16(ip_filt_8x16b_r2_1, ip_filt_8x16b_r3_1);
883 
884                 for(i4_y = 0; i4_y < i4_temp_array_ht; i4_y++)
885                 {
886                     inp_8x16b_0 = _mm_loadu_si128((__m128i *) (pi2_interp_buff_temp + strt_indx));
887                     inp_8x16b_1 = _mm_loadu_si128(
888                         (__m128i *) (pi2_interp_buff_temp + strt_indx + strt_indx_h));
889                     inp_8x16b_2 =
890                         _mm_loadu_si128((__m128i *) (pi2_interp_buff_temp + strt_indx + 3));
891                     inp_8x16b_3 = _mm_loadu_si128(
892                         (__m128i *) (pi2_interp_buff_temp + strt_indx + strt_indx_h + 3));
893                     pi2_interp_buff_temp += i4_refarray_wd;
894 
895                     inp_8x16b_r0_0 = _mm_shuffle_epi8(inp_8x16b_0, x_ref_pos_luma_mask_r0_0);
896                     inp_8x16b_r0_1 = _mm_shuffle_epi8(inp_8x16b_1, x_ref_pos_luma_mask_r0_1);
897                     inp_8x16b_r1_0 = _mm_shuffle_epi8(inp_8x16b_0, x_ref_pos_luma_mask_r1_0);
898                     inp_8x16b_r1_1 = _mm_shuffle_epi8(inp_8x16b_1, x_ref_pos_luma_mask_r1_1);
899 
900                     inp_8x16b_r2_0 = _mm_shuffle_epi8(inp_8x16b_0, x_ref_pos_luma_mask_r2_0);
901                     inp_8x16b_r2_1 = _mm_shuffle_epi8(inp_8x16b_1, x_ref_pos_luma_mask_r2_1);
902                     inp_8x16b_r3_0 = _mm_shuffle_epi8(inp_8x16b_2, x_ref_pos_luma_mask_r3_0);
903                     inp_8x16b_r3_1 = _mm_shuffle_epi8(inp_8x16b_3, x_ref_pos_luma_mask_r3_1);
904 
905                     inp_8x16b_r01_l_0 = _mm_unpacklo_epi16(inp_8x16b_r0_0, inp_8x16b_r1_0);
906                     inp_8x16b_r23_l_0 = _mm_unpacklo_epi16(inp_8x16b_r2_0, inp_8x16b_r3_0);
907                     inp_8x16b_r01_l_1 = _mm_unpackhi_epi16(inp_8x16b_r0_0, inp_8x16b_r1_0);
908                     inp_8x16b_r23_l_1 = _mm_unpackhi_epi16(inp_8x16b_r2_0, inp_8x16b_r3_0);
909 
910                     inp_8x16b_r01_h_0 = _mm_unpacklo_epi16(inp_8x16b_r0_1, inp_8x16b_r1_1);
911                     inp_8x16b_r23_h_0 = _mm_unpacklo_epi16(inp_8x16b_r2_1, inp_8x16b_r3_1);
912                     inp_8x16b_r01_h_1 = _mm_unpackhi_epi16(inp_8x16b_r0_1, inp_8x16b_r1_1);
913                     inp_8x16b_r23_h_1 = _mm_unpackhi_epi16(inp_8x16b_r2_1, inp_8x16b_r3_1);
914 
915                     out_res_4x32b_r01_l_0 =
916                         _mm_madd_epi16(inp_8x16b_r01_l_0, ip_filt_8x16b_r01_l_0);
917                     out_res_4x32b_r01_l_1 =
918                         _mm_madd_epi16(inp_8x16b_r01_l_1, ip_filt_8x16b_r01_l_1);
919                     out_res_4x32b_r23_l_0 =
920                         _mm_madd_epi16(inp_8x16b_r23_l_0, ip_filt_8x16b_r23_l_0);
921                     out_res_4x32b_r23_l_1 =
922                         _mm_madd_epi16(inp_8x16b_r23_l_1, ip_filt_8x16b_r23_l_1);
923 
924                     out_res_4x32b_r01_h_0 =
925                         _mm_madd_epi16(inp_8x16b_r01_h_0, ip_filt_8x16b_r01_h_0);
926                     out_res_4x32b_r01_h_1 =
927                         _mm_madd_epi16(inp_8x16b_r01_h_1, ip_filt_8x16b_r01_h_1);
928                     out_res_4x32b_r23_h_0 =
929                         _mm_madd_epi16(inp_8x16b_r23_h_0, ip_filt_8x16b_r23_h_0);
930                     out_res_4x32b_r23_h_1 =
931                         _mm_madd_epi16(inp_8x16b_r23_h_1, ip_filt_8x16b_r23_h_1);
932 
933                     out_res_4x32b_l_0 = _mm_add_epi32(out_res_4x32b_r01_l_0, out_res_4x32b_r23_l_0);
934                     out_res_4x32b_l_1 = _mm_add_epi32(out_res_4x32b_r01_l_1, out_res_4x32b_r23_l_1);
935                     out_res_4x32b_h_0 = _mm_add_epi32(out_res_4x32b_r01_h_0, out_res_4x32b_r23_h_0);
936                     out_res_4x32b_h_1 = _mm_add_epi32(out_res_4x32b_r01_h_1, out_res_4x32b_r23_h_1);
937 
938                     out_res_4x32b_l_0 =
939                         _mm_srai_epi32(_mm_add_epi32(out_res_4x32b_l_0, const_512), 10);
940                     out_res_4x32b_l_1 =
941                         _mm_srai_epi32(_mm_add_epi32(out_res_4x32b_l_1, const_512), 10);
942                     out_res_4x32b_h_0 =
943                         _mm_srai_epi32(_mm_add_epi32(out_res_4x32b_h_0, const_512), 10);
944                     out_res_4x32b_h_1 =
945                         _mm_srai_epi32(_mm_add_epi32(out_res_4x32b_h_1, const_512), 10);
946 
947                     out_res_8x16b_l = _mm_packs_epi32(out_res_4x32b_l_0, out_res_4x32b_l_1);
948                     out_res_8x16b_h = _mm_packs_epi32(out_res_4x32b_h_0, out_res_4x32b_h_1);
949 
950                     out_res_16x8b = _mm_packus_epi16(out_res_8x16b_l, out_res_8x16b_h);
951                     _mm_storeu_si128((__m128i *) (pu1_out + (i4_y * i4_out_stride)), out_res_16x8b);
952                 }
953             }
954         }
955     }
956     else
957     {
958         WORD16 *pi2_interp_buff_temp;
959         pi2_interp_buff_temp = (WORD16 *) pi4_interp_buff;
960 
961         {
962             __m128i inp_8x16b_r0, inp_8x16b_r01_0, phs_mask_16x8b_r0, phs_mask_16x8b_r01_0,
963                 out_res_8x16b_r01_0;
964             __m128i inp_8x16b_r1, phs_mask_16x8b_r1, out_res_8x16b_r01_1;
965             __m128i inp_8x16b_r01_1, phs_mask_16x8b_r01_1;
966 
967             for(i4_y = 0; i4_y < (i4_temp_array_ht); i4_y++)
968             {
969                 arr_y_ref_pos_luma[i4_y] = (WORD8) ps_y_pos_phase[i4_y + i4_frm_mb_y].i2_ref_pos;
970                 arr_phase_luma[i4_y] = (WORD8) ps_y_pos_phase[i4_y + i4_frm_mb_y].i2_phase;
971             }
972             pi4_y_ref_pos_luma = arr_y_ref_pos_luma;
973             pi4_phase_luma = arr_phase_luma;
974 
975             for(i4_y = 0; i4_y < (i4_temp_array_ht); i4_y++)
976             {
977                 pu1_refarray_temp =
978                     pu1_refarray + (pi4_y_ref_pos_luma[i4_y] * i4_refarray_wd) + (i4_x_min - 1);
979                 inp_8x16b_r0 = _mm_loadu_si128((__m128i *) (pu1_refarray_temp));
980                 inp_8x16b_r1 = _mm_loadu_si128((__m128i *) (pu1_refarray_temp + i4_refarray_wd));
981 
982                 inp_8x16b_r01_0 = _mm_unpacklo_epi8(inp_8x16b_r0, inp_8x16b_r1);
983                 inp_8x16b_r01_1 = _mm_unpackhi_epi8(inp_8x16b_r0, inp_8x16b_r1);
984 
985                 phs_mask_16x8b_r0 = _mm_set1_epi8(g_au1_interp_filter_chroma[pi4_phase_luma[i4_y]]);
986                 phs_mask_16x8b_r1 =
987                     _mm_set1_epi8(g_au1_interp_filter_chroma[pi4_phase_luma[i4_y] + 16]);
988 
989                 phs_mask_16x8b_r01_0 = _mm_unpacklo_epi8(phs_mask_16x8b_r0, phs_mask_16x8b_r1);
990                 phs_mask_16x8b_r01_1 = _mm_unpackhi_epi8(phs_mask_16x8b_r0, phs_mask_16x8b_r1);
991 
992                 out_res_8x16b_r01_0 = _mm_maddubs_epi16(inp_8x16b_r01_0, phs_mask_16x8b_r01_0);
993                 out_res_8x16b_r01_1 = _mm_maddubs_epi16(inp_8x16b_r01_1, phs_mask_16x8b_r01_1);
994 
995                 _mm_storeu_si128(
996                     (__m128i *) (pi2_interp_buff_temp + (i4_y * i4_refarray_wd) + (i4_x_min - 1)),
997                     out_res_8x16b_r01_0);
998                 _mm_storeu_si128((__m128i *) (pi2_interp_buff_temp + (i4_y * i4_refarray_wd) +
999                                               (i4_x_min - 1) + 8),
1000                                  out_res_8x16b_r01_1);
1001             }
1002         }
1003 
1004         {
1005             WORD32 strt_indx = 10;
1006             __m128i inp_8x16b_0, inp_8x16b_r0_0;
1007             __m128i phs_mask_16x8b_0;
1008             __m128i x_ref_pos_luma_mask_r0_0, x_ref_pos_luma_mask_r1_0;
1009             __m128i ip_filt_16x8b_r0, ip_filt_8x16b_r0_0, ip_filt_8x16b_r01_l_0;
1010             __m128i ip_filt_16x8b_r1, ip_filt_8x16b_r1_0, ip_filt_8x16b_r01_l_1;
1011             __m128i inp_8x16b_r1_0, inp_8x16b_r01_l_0, out_res_4x32b_r01_l_0;
1012             __m128i inp_8x16b_r01_l_1, out_res_4x32b_r01_l_1;
1013 
1014             __m128i out_res_4x32b_l_0;
1015             __m128i out_res_4x32b_l_1;
1016             __m128i out_res_8x16b_l;
1017             __m128i out_16x8b_r1;
1018             __m128i chroma_mask;
1019             __m128i const_512 = _mm_set1_epi32(512);
1020 
1021             WORD32 i4_x2 = 0;
1022             __m128i twos = _mm_set1_epi8(2);
1023             strt_indx = ps_x_pos_phase[0 + i4_frm_mb_x].i2_ref_pos;
1024             for(i4_x = 0; i4_x < i4_mb_wd; i4_x++)
1025             {
1026                 arr_x_ref_pos_luma[i4_x] = (WORD8) ps_x_pos_phase[i4_x + i4_frm_mb_x].i2_ref_pos;
1027                 arr_phase_luma[i4_x] = (WORD8) ps_x_pos_phase[i4_x + i4_frm_mb_x].i2_phase;
1028                 arr_x_ref_pos_luma[i4_x] = arr_x_ref_pos_luma[i4_x] - strt_indx;
1029                 i4_x2 = i4_x << 1;
1030                 arr_x_ref_pos_luma_low[i4_x2] = (arr_x_ref_pos_luma[i4_x]) << 1;
1031                 arr_x_ref_pos_luma_low[i4_x2 + 1] = arr_x_ref_pos_luma_low[i4_x2] + 1;
1032             }
1033 
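            /* Horizontal pass setup: x_ref positions become byte-pair indices */
            /* (2*pos, 2*pos + 1) so _mm_shuffle_epi8 can gather 16-bit temp   */
            /* samples; the per-phase taps are gathered the same way and then  */
            /* sign-extended to 16 bits                                        */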
1034             pi4_x_ref_pos_luma_low = arr_x_ref_pos_luma_low;
1035             pi4_phase_luma = arr_phase_luma;
1036             phs_mask_16x8b_0 = _mm_loadu_si128((__m128i *) (pi4_phase_luma));
1037             x_ref_pos_luma_mask_r0_0 = _mm_loadu_si128((__m128i *) (pi4_x_ref_pos_luma_low));
1038             x_ref_pos_luma_mask_r1_0 = _mm_add_epi8(x_ref_pos_luma_mask_r0_0, twos);
1039 
1040             ip_filt_16x8b_r0 = _mm_loadu_si128((__m128i *) (g_au1_interp_filter_chroma));
1041             ip_filt_16x8b_r1 = _mm_loadu_si128((__m128i *) (g_au1_interp_filter_chroma + 16));
1042 
1043             ip_filt_8x16b_r0_0 =
1044                 _mm_cvtepi8_epi16(_mm_shuffle_epi8(ip_filt_16x8b_r0, phs_mask_16x8b_0));
1045             ip_filt_8x16b_r1_0 =
1046                 _mm_cvtepi8_epi16(_mm_shuffle_epi8(ip_filt_16x8b_r1, phs_mask_16x8b_0));
1047 
1048             ip_filt_8x16b_r01_l_0 = _mm_unpacklo_epi16(ip_filt_8x16b_r0_0, ip_filt_8x16b_r1_0);
1049             ip_filt_8x16b_r01_l_1 = _mm_unpackhi_epi16(ip_filt_8x16b_r0_0, ip_filt_8x16b_r1_0);
1050 
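            /* Per output row: gather samples x_ref and x_ref + 1 for every      */
            /* position, pair them with unpack, and madd against the interleaved */
            /* taps; (+512) >> 10 rounds the Q10 sums back to pixel range        */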
1051             for(i4_y = 0; i4_y < i4_temp_array_ht; i4_y++)
1052             {
1053                 inp_8x16b_0 = _mm_loadu_si128((__m128i *) (pi2_interp_buff_temp + strt_indx));
1054                 pi2_interp_buff_temp += i4_refarray_wd;
1055 
1056                 inp_8x16b_r0_0 = _mm_shuffle_epi8(inp_8x16b_0, x_ref_pos_luma_mask_r0_0);
1057                 inp_8x16b_r1_0 = _mm_shuffle_epi8(inp_8x16b_0, x_ref_pos_luma_mask_r1_0);
1058 
1059                 inp_8x16b_r01_l_0 = _mm_unpacklo_epi16(inp_8x16b_r0_0, inp_8x16b_r1_0);
1060                 inp_8x16b_r01_l_1 = _mm_unpackhi_epi16(inp_8x16b_r0_0, inp_8x16b_r1_0);
1061 
1062                 out_res_4x32b_r01_l_0 = _mm_madd_epi16(inp_8x16b_r01_l_0, ip_filt_8x16b_r01_l_0);
1063                 out_res_4x32b_r01_l_1 = _mm_madd_epi16(inp_8x16b_r01_l_1, ip_filt_8x16b_r01_l_1);
1064 
1065                 out_res_4x32b_l_0 =
1066                     _mm_srai_epi32(_mm_add_epi32(out_res_4x32b_r01_l_0, const_512), 10);
1067                 out_res_4x32b_l_1 =
1068                     _mm_srai_epi32(_mm_add_epi32(out_res_4x32b_r01_l_1, const_512), 10);
1069 
1070                 out_res_8x16b_l = _mm_packs_epi32(out_res_4x32b_l_0, out_res_4x32b_l_1);
1071 
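                /* merge into the interleaved chroma row: 0xFF00 keeps the other */
                /* component's bytes, and the byte-wise add drops each result    */
                /* (<= 255, high byte zero) into the low byte                    */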
1072                 chroma_mask = _mm_set1_epi16(0xFF00);
1073                 out_16x8b_r1 = _mm_loadu_si128((__m128i *) (pu1_out + (i4_y * i4_out_stride)));
1074                 out_16x8b_r1 = _mm_and_si128(out_16x8b_r1, chroma_mask);
1075                 out_16x8b_r1 = _mm_add_epi8(out_res_8x16b_l, out_16x8b_r1);
1076                 _mm_storeu_si128((__m128i *) (pu1_out + (i4_y * i4_out_stride)), out_16x8b_r1);
1077             }
1078         }
1079     }
1080     return;
1081 } /* End of Interpolation Function */
1082 
1083 /*****************************************************************************/
1084 /*                                                                           */
1085 /*  Function Name : isvcd_vert_interpol_chroma_dyadic_1_sse42                 */
1086 /*                                                                           */
1087 /*  Description   : This function takes the reference array buffer & performs*/
1088 /*                  vertical intra resampling for dyadic scaling ratios for  */
1089 /*                  chroma                                                   */
1090 /*  Inputs        : pu1_inp_buf : ptr to the 6x6 reference sample buffer     */
1091 /*                    pi2_tmp_filt_buf : ptr to the 6x8 buffer to hold the   */
1092 /*                        vertically interpolated data                       */
1093 /*                    i4_phase_0 : y phase for even values of y              */
1094 /*                    i4_phase_1 : y phase for odd values of y               */
1095 /*  Globals       : none                                                     */
1096 /*  Processing    : it does the interpolation in vertical direction using    */
1097 /*                  the two-tap (8 - phase, phase) filter                    */
1098 /*  Outputs       : vertically resampled samples in the 16-bit temporary     */
1099 /*                  buffer                                                   */
1100 /*  Returns       : none                                                     */
1101 /*                                                                           */
1102 /*  Issues        : none                                                     */
1103 /*                                                                           */
1104 /*  Revision History:                                                        */
1105 /*                                                                           */
1106 /*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
1107 /*         06 09 2021   Kishore              creation                        */
1108 /*                                                                           */
1109 /*****************************************************************************/
1110 void isvcd_vert_interpol_chroma_dyadic_1_sse42(UWORD8 *pu1_inp_buf, WORD16 *pi2_tmp_filt_buf,
1111                                                WORD32 i4_phase_0, WORD32 i4_phase_1)
1112 {
1113     WORD8 i4_coeff_0, i4_coeff_1, i4_coeff_2, i4_coeff_3;
1114     WORD32 i4_filt_stride, i4_src_stride;
1115     UWORD8 *pu1_inp;
1116     WORD16 *pi2_tmp;
1117     __m128i i4_samp_16x8b_0, i4_samp_16x8b_1, i4_samp_16x8b_2, i4_samp_16x8b_3, i4_samp_16x8b_4,
1118         i4_samp_16x8b_5;
1119     __m128i i4_res_8x16b_r0, i4_res_8x16b_r1, i4_res_8x16b_r2, i4_res_8x16b_r3, i4_res_8x16b_r4,
1120         i4_res_8x16b_r5, i4_res_8x16b_r6, i4_res_8x16b_r7;
1121     __m128i i4_res_8x16b_r7_temp;
1122     __m128i i4_c0_c1_16x8b, i4_c2_c3_16x8b;
1123 
1124     i4_coeff_0 = (WORD8) (8 - i4_phase_0);
1125     i4_coeff_1 = (WORD8) (i4_phase_0);
1126     i4_coeff_2 = (WORD8) (8 - i4_phase_1);
1127     i4_coeff_3 = (WORD8) (i4_phase_1);
1128 
1129     i4_c0_c1_16x8b =
1130         _mm_set_epi8(i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0,
1131                      i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0,
1132                      i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0);
1133     i4_c2_c3_16x8b =
1134         _mm_set_epi8(i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2,
1135                      i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2,
1136                      i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2);
1137 
1138     pu1_inp = pu1_inp_buf;
1139     pi2_tmp = pi2_tmp_filt_buf;
1140     i4_filt_stride = 6;
1141     i4_src_stride = DYADIC_REF_W_C;
1142 
1143     i4_samp_16x8b_0 = _mm_loadl_epi64((__m128i *) (pu1_inp));
1144     i4_samp_16x8b_1 = _mm_loadl_epi64((__m128i *) (pu1_inp + i4_src_stride));
1145     i4_samp_16x8b_2 = _mm_loadl_epi64((__m128i *) (pu1_inp + (i4_src_stride << 1)));
1146     i4_samp_16x8b_3 = _mm_loadl_epi64((__m128i *) (pu1_inp + (i4_src_stride << 1) + i4_src_stride));
1147     i4_samp_16x8b_4 = _mm_loadl_epi64((__m128i *) (pu1_inp + (i4_src_stride << 2)));
1148     i4_samp_16x8b_5 = _mm_loadl_epi64((__m128i *) (pu1_inp + (i4_src_stride << 2) + i4_src_stride));
1149 
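    /* Each maddubs applies both vertical taps at once: the source register */
    /* interleaves bytes of two adjacent rows and the coefficient register  */
    /* repeats the matching (c0, c1) or (c2, c3) pair                       */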
1150     i4_samp_16x8b_0 = _mm_unpacklo_epi8(i4_samp_16x8b_0, i4_samp_16x8b_1);
1151     i4_res_8x16b_r0 = _mm_maddubs_epi16(i4_samp_16x8b_0, i4_c0_c1_16x8b);
1152     _mm_storeu_si128((__m128i *) (pi2_tmp), i4_res_8x16b_r0);
1153 
1154     i4_samp_16x8b_1 = _mm_unpacklo_epi8(i4_samp_16x8b_1, i4_samp_16x8b_2);
1155     i4_res_8x16b_r1 = _mm_maddubs_epi16(i4_samp_16x8b_1, i4_c2_c3_16x8b);
1156     _mm_storeu_si128((__m128i *) (pi2_tmp + i4_filt_stride), i4_res_8x16b_r1);
1157 
1158     i4_res_8x16b_r2 = _mm_maddubs_epi16(i4_samp_16x8b_1, i4_c0_c1_16x8b);
1159     _mm_storeu_si128((__m128i *) (pi2_tmp + (i4_filt_stride << 1)), i4_res_8x16b_r2);
1160 
1161     i4_samp_16x8b_2 = _mm_unpacklo_epi8(i4_samp_16x8b_2, i4_samp_16x8b_3);
1162     i4_res_8x16b_r3 = _mm_maddubs_epi16(i4_samp_16x8b_2, i4_c2_c3_16x8b);
1163     _mm_storeu_si128((__m128i *) (pi2_tmp + (i4_filt_stride << 1) + i4_filt_stride),
1164                      i4_res_8x16b_r3);
1165 
1166     i4_res_8x16b_r4 = _mm_maddubs_epi16(i4_samp_16x8b_2, i4_c0_c1_16x8b);
1167     _mm_storeu_si128((__m128i *) (pi2_tmp + (i4_filt_stride << 2)), i4_res_8x16b_r4);
1168 
1169     i4_samp_16x8b_3 = _mm_unpacklo_epi8(i4_samp_16x8b_3, i4_samp_16x8b_4);
1170     i4_res_8x16b_r5 = _mm_maddubs_epi16(i4_samp_16x8b_3, i4_c2_c3_16x8b);
1171     _mm_storeu_si128((__m128i *) (pi2_tmp + (i4_filt_stride << 2) + i4_filt_stride),
1172                      i4_res_8x16b_r5);
1173 
1174     i4_res_8x16b_r6 = _mm_maddubs_epi16(i4_samp_16x8b_3, i4_c0_c1_16x8b);
1175     _mm_storel_epi64((__m128i *) (pi2_tmp + (i4_filt_stride << 2) + (i4_filt_stride << 1)),
1176                      i4_res_8x16b_r6);
1177 
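    /* Temp rows are 6 samples wide, so the last two rows cannot take full  */
    /* 16-byte stores: the storel above wrote the first 4 results of row 7; */
    /* the shuffles and blend below stitch its last 2 results to all 6 of   */
    /* row 8 for a single final 16-byte store                               */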
1178     i4_res_8x16b_r6 = _mm_shuffle_epi32(i4_res_8x16b_r6, 78);
1179 
1180     i4_samp_16x8b_4 = _mm_unpacklo_epi8(i4_samp_16x8b_4, i4_samp_16x8b_5);
1181 
1182     i4_res_8x16b_r7 = _mm_maddubs_epi16(i4_samp_16x8b_4, i4_c2_c3_16x8b);
1183 
1184     i4_res_8x16b_r7 = _mm_shuffle_epi32(i4_res_8x16b_r7, 147);
1185 
1186     i4_res_8x16b_r7_temp = _mm_blend_epi16(i4_res_8x16b_r6, i4_res_8x16b_r7, 252);
1187 
1188     _mm_storeu_si128((__m128i *) (pi2_tmp + (i4_filt_stride << 2) + (i4_filt_stride << 1) + 4),
1189                      i4_res_8x16b_r7_temp);
1190 }
1191 
1192 /*****************************************************************************/
1193 /*                                                                           */
1194 /*  Function Name : isvcd_vert_interpol_chroma_dyadic_2_sse42                 */
1195 /*                                                                           */
1196 /*  Description   : This function takes the reference array buffer & performs*/
1197 /*                  vertical intra resampling for dyadic scaling ratios for  */
1198 /*                  chroma for the following ref_lyr_chroma_phase_y_plus1 and*/
1199 /*                    chroma_phase_y_plus1:                                  */
1200 /*                        ref_lyr        cur_lyr                             */
1201 /*                            0            1                                 */
1202 /*                            0            2                                 */
1203 /*  Inputs        : pu1_inp_buf : ptr to the 6x6 reference sample buffer     */
1204 /*                    pi2_tmp_filt_buf : ptr to the 6x8 buffer to hold the   */
1205 /*                        vertically interpolated data                       */
1206 /*                    i4_phase_0 : y phase for even values of y              */
1207 /*                    i4_phase_1 : y phase for odd values of y               */
1208 /*  Globals       : none                                                     */
1209 /*  Processing    : it does the interpolation in vertical direction          */
1210 /*  Outputs       : vertically resampled samples                             */
1211 /*  Returns       : none                                                     */
1212 /*                                                                           */
1213 /*  Issues        : none                                                     */
1214 /*                                                                           */
1215 /*  Revision History:                                                        */
1216 /*                                                                           */
1217 /*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
1218 /*         21 05 2021   Dolan          creation                              */
1219 /*                                                                           */
1220 /*****************************************************************************/
1221 void isvcd_vert_interpol_chroma_dyadic_2_sse42(UWORD8 *pu1_inp_buf, WORD16 *pi2_tmp_filt_buf,
1222                                                WORD32 i4_phase_0, WORD32 i4_phase_1)
1223 {
1224     WORD8 i4_coeff_0, i4_coeff_1, i4_coeff_2, i4_coeff_3;
1225     WORD32 i4_filt_stride, i4_src_stride;
1226     UWORD8 *pu1_inp;
1227     WORD16 *pi2_tmp;
1228     __m128i i4_samp_16x8b_0, i4_samp_16x8b_1, i4_samp_16x8b_2, i4_samp_16x8b_3, i4_samp_16x8b_4;
1229     __m128i i4_res_8x16b_r0, i4_res_8x16b_r1, i4_res_8x16b_r2, i4_res_8x16b_r3, i4_res_8x16b_r4,
1230         i4_res_8x16b_r5, i4_res_8x16b_r6, i4_res_8x16b_r7;
1231     __m128i i4_res_8x16b_r7_temp, i4_c0_c1_16x8b, i4_c2_c3_16x8b;
1232     i4_coeff_0 = (WORD8) (8 - i4_phase_0);
1233     i4_coeff_1 = (WORD8) (i4_phase_0);
1234     i4_coeff_2 = (WORD8) (8 - i4_phase_1);
1235     i4_coeff_3 = (WORD8) (i4_phase_1);
1236 
1237     i4_c0_c1_16x8b =
1238         _mm_set_epi8(i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0,
1239                      i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0,
1240                      i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0);
1241     i4_c2_c3_16x8b =
1242         _mm_set_epi8(i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2,
1243                      i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2,
1244                      i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2);
1245 
1246     pi2_tmp = pi2_tmp_filt_buf;
1247     i4_filt_stride = 6;
1248     i4_src_stride = DYADIC_REF_W_C;
1249     pu1_inp = pu1_inp_buf + i4_src_stride;
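    /* this phase combination only uses reference rows 1..5, so start one */
    /* stride into the input buffer                                       */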
1250 
1251     i4_samp_16x8b_0 = _mm_loadu_si128((__m128i *) (pu1_inp));
1252     i4_samp_16x8b_1 = _mm_loadu_si128((__m128i *) (pu1_inp + i4_src_stride));
1253     i4_samp_16x8b_2 = _mm_loadu_si128((__m128i *) (pu1_inp + (i4_src_stride << 1)));
1254     i4_samp_16x8b_3 = _mm_loadu_si128((__m128i *) (pu1_inp + (i4_src_stride << 1) + i4_src_stride));
1255     i4_samp_16x8b_4 = _mm_loadu_si128((__m128i *) (pu1_inp + (i4_src_stride << 2)));
1256 
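    /* each interleaved row pair yields two temp rows directly: one maddubs */
    /* with (c0, c1) and one with (c2, c3)                                  */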
1257     i4_samp_16x8b_0 = _mm_unpacklo_epi8(i4_samp_16x8b_0, i4_samp_16x8b_1);
1258     i4_res_8x16b_r0 = _mm_maddubs_epi16(i4_samp_16x8b_0, i4_c0_c1_16x8b);
1259     _mm_storeu_si128((__m128i *) (pi2_tmp), i4_res_8x16b_r0);
1260 
1261     i4_res_8x16b_r1 = _mm_maddubs_epi16(i4_samp_16x8b_0, i4_c2_c3_16x8b);
1262     _mm_storeu_si128((__m128i *) (pi2_tmp + i4_filt_stride), i4_res_8x16b_r1);
1263 
1264     i4_samp_16x8b_1 = _mm_unpacklo_epi8(i4_samp_16x8b_1, i4_samp_16x8b_2);
1265     i4_res_8x16b_r2 = _mm_maddubs_epi16(i4_samp_16x8b_1, i4_c0_c1_16x8b);
1266     _mm_storeu_si128((__m128i *) (pi2_tmp + (i4_filt_stride << 1)), i4_res_8x16b_r2);
1267 
1268     i4_res_8x16b_r3 = _mm_maddubs_epi16(i4_samp_16x8b_1, i4_c2_c3_16x8b);
1269     _mm_storeu_si128((__m128i *) (pi2_tmp + (i4_filt_stride << 1) + i4_filt_stride),
1270                      i4_res_8x16b_r3);
1271 
1272     i4_samp_16x8b_2 = _mm_unpacklo_epi8(i4_samp_16x8b_2, i4_samp_16x8b_3);
1273     i4_res_8x16b_r4 = _mm_maddubs_epi16(i4_samp_16x8b_2, i4_c0_c1_16x8b);
1274     _mm_storeu_si128((__m128i *) (pi2_tmp + (i4_filt_stride << 2)), i4_res_8x16b_r4);
1275 
1276     i4_res_8x16b_r5 = _mm_maddubs_epi16(i4_samp_16x8b_2, i4_c2_c3_16x8b);
1277     _mm_storeu_si128((__m128i *) (pi2_tmp + (i4_filt_stride << 2) + i4_filt_stride),
1278                      i4_res_8x16b_r5);
1279 
1280     i4_samp_16x8b_3 = _mm_unpacklo_epi8(i4_samp_16x8b_3, i4_samp_16x8b_4);
1281     i4_res_8x16b_r6 = _mm_maddubs_epi16(i4_samp_16x8b_3, i4_c0_c1_16x8b);
1282     _mm_storel_epi64((__m128i *) (pi2_tmp + (i4_filt_stride << 2) + (i4_filt_stride << 1)),
1283                      i4_res_8x16b_r6);
1284 
1285     i4_res_8x16b_r7 = _mm_maddubs_epi16(i4_samp_16x8b_3, i4_c2_c3_16x8b);
1286     i4_res_8x16b_r6 = _mm_shuffle_epi32(i4_res_8x16b_r6, 78);
1287     i4_res_8x16b_r7 = _mm_shuffle_epi32(i4_res_8x16b_r7, 147);
1288     i4_res_8x16b_r7_temp = _mm_blend_epi16(i4_res_8x16b_r6, i4_res_8x16b_r7, 252);
1289 
1290     _mm_storeu_si128((__m128i *) (pi2_tmp + (i4_filt_stride << 2) + (i4_filt_stride << 1) + 4),
1291                      i4_res_8x16b_r7_temp);
1292 }
1293 
1294 /*****************************************************************************/
1295 /*                                                                           */
1296 /*  Function Name : isvcd_vert_interpol_chroma_dyadic_3_sse42                 */
1297 /*                                                                           */
1298 /*  Description   : This function takes the reference array buffer & performs*/
1299 /*                  vertical intra resampling for dyadic scaling ratios for  */
1300 /*                  chroma for the following ref_lyr_chroma_phase_y_plus1 and*/
1301 /*                  chroma_phase_y_plus1:                                    */
1302 /*                        ref_lyr        cur_lyr                             */
1303 /*                            2            0                                 */
1304 /*  Inputs        : pu1_inp_buf : ptr to the 6x6 reference sample buffer     */
1305 /*                    pi2_tmp_filt_buf : ptr to the 6x8 buffer to hold the   */
1306 /*                        vertically interpolated data                       */
1307 /*                    i4_phase_0 : y phase for even values of y              */
1308 /*                    i4_phase_1 : y phase for odd values of y               */
1309 /*  Globals       : none                                                     */
1310 /*  Processing    : it does the interpolation in vertical direction          */
1311 /*  Outputs       : vertically resampled samples                             */
1312 /*  Returns       : none                                                     */
1313 /*                                                                           */
1314 /*  Issues        : none                                                     */
1315 /*                                                                           */
1316 /*  Revision History:                                                        */
1317 /*                                                                           */
1318 /*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
1319 /*         21 05 2021   Dolan          creation                              */
1320 /*                                                                           */
1321 /*****************************************************************************/
1322 void isvcd_vert_interpol_chroma_dyadic_3_sse42(UWORD8 *pu1_inp_buf, WORD16 *pi2_tmp_filt_buf,
1323                                                WORD32 i4_phase_0, WORD32 i4_phase_1)
1324 {
1325     WORD8 i4_coeff_0, i4_coeff_1, i4_coeff_2, i4_coeff_3;
1326     WORD32 i4_filt_stride, i4_src_stride;
1327     UWORD8 *pu1_inp;
1328     WORD16 *pi2_tmp;
1329     __m128i i4_samp_16x8b_0, i4_samp_16x8b_1, i4_samp_16x8b_2, i4_samp_16x8b_3, i4_samp_16x8b_4;
1330     __m128i i4_res_8x16b_r0, i4_res_8x16b_r1, i4_res_8x16b_r2, i4_res_8x16b_r3, i4_res_8x16b_r4,
1331         i4_res_8x16b_r5, i4_res_8x16b_r6, i4_res_8x16b_r7;
1332     __m128i i4_res_8x16b_r7_temp, i4_c0_c1_16x8b, i4_c2_c3_16x8b;
1333     i4_coeff_0 = (WORD8) (8 - i4_phase_0);
1334     i4_coeff_1 = (WORD8) (i4_phase_0);
1335     i4_coeff_2 = (WORD8) (8 - i4_phase_1);
1336     i4_coeff_3 = (WORD8) (i4_phase_1);
1337 
1338     i4_c0_c1_16x8b =
1339         _mm_set_epi8(i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0,
1340                      i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0,
1341                      i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0);
1342     i4_c2_c3_16x8b =
1343         _mm_set_epi8(i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2,
1344                      i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2,
1345                      i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2);
1346 
1347     pi2_tmp = pi2_tmp_filt_buf;
1348     i4_filt_stride = 6;
1349     i4_src_stride = DYADIC_REF_W_C;
1350     pu1_inp = pu1_inp_buf;
1351 
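    /* same two-rows-in, two-rows-out pattern as the dyadic_2 variant, but */
    /* anchored at reference row 0 for this phase combination              */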
1352     i4_samp_16x8b_0 = _mm_loadu_si128((__m128i *) (pu1_inp));
1353     i4_samp_16x8b_1 = _mm_loadu_si128((__m128i *) (pu1_inp + i4_src_stride));
1354     i4_samp_16x8b_2 = _mm_loadu_si128((__m128i *) (pu1_inp + (i4_src_stride << 1)));
1355     i4_samp_16x8b_3 = _mm_loadu_si128((__m128i *) (pu1_inp + (i4_src_stride << 1) + i4_src_stride));
1356     i4_samp_16x8b_4 = _mm_loadu_si128((__m128i *) (pu1_inp + (i4_src_stride << 2)));
1357 
1358     i4_samp_16x8b_0 = _mm_unpacklo_epi8(i4_samp_16x8b_0, i4_samp_16x8b_1);
1359     i4_res_8x16b_r0 = _mm_maddubs_epi16(i4_samp_16x8b_0, i4_c0_c1_16x8b);
1360     _mm_storeu_si128((__m128i *) (pi2_tmp), i4_res_8x16b_r0);
1361 
1362     i4_res_8x16b_r1 = _mm_maddubs_epi16(i4_samp_16x8b_0, i4_c2_c3_16x8b);
1363     _mm_storeu_si128((__m128i *) (pi2_tmp + i4_filt_stride), i4_res_8x16b_r1);
1364 
1365     i4_samp_16x8b_1 = _mm_unpacklo_epi8(i4_samp_16x8b_1, i4_samp_16x8b_2);
1366     i4_res_8x16b_r2 = _mm_maddubs_epi16(i4_samp_16x8b_1, i4_c0_c1_16x8b);
1367     _mm_storeu_si128((__m128i *) (pi2_tmp + (i4_filt_stride << 1)), i4_res_8x16b_r2);
1368 
1369     i4_res_8x16b_r3 = _mm_maddubs_epi16(i4_samp_16x8b_1, i4_c2_c3_16x8b);
1370     _mm_storeu_si128((__m128i *) (pi2_tmp + (i4_filt_stride << 1) + i4_filt_stride),
1371                      i4_res_8x16b_r3);
1372 
1373     i4_samp_16x8b_2 = _mm_unpacklo_epi8(i4_samp_16x8b_2, i4_samp_16x8b_3);
1374     i4_res_8x16b_r4 = _mm_maddubs_epi16(i4_samp_16x8b_2, i4_c0_c1_16x8b);
1375     _mm_storeu_si128((__m128i *) (pi2_tmp + (i4_filt_stride << 2)), i4_res_8x16b_r4);
1376 
1377     i4_res_8x16b_r5 = _mm_maddubs_epi16(i4_samp_16x8b_2, i4_c2_c3_16x8b);
1378     _mm_storeu_si128((__m128i *) (pi2_tmp + (i4_filt_stride << 2) + i4_filt_stride),
1379                      i4_res_8x16b_r5);
1380 
1381     i4_samp_16x8b_3 = _mm_unpacklo_epi8(i4_samp_16x8b_3, i4_samp_16x8b_4);
1382     i4_res_8x16b_r6 = _mm_maddubs_epi16(i4_samp_16x8b_3, i4_c0_c1_16x8b);
1383     _mm_storel_epi64((__m128i *) (pi2_tmp + (i4_filt_stride << 2) + (i4_filt_stride << 1)),
1384                      i4_res_8x16b_r6);
1385 
1386     i4_res_8x16b_r7 = _mm_maddubs_epi16(i4_samp_16x8b_3, i4_c2_c3_16x8b);
1387     i4_res_8x16b_r6 = _mm_shuffle_epi32(i4_res_8x16b_r6, 78);
1388     i4_res_8x16b_r7 = _mm_shuffle_epi32(i4_res_8x16b_r7, 147);
1389     i4_res_8x16b_r7_temp = _mm_blend_epi16(i4_res_8x16b_r6, i4_res_8x16b_r7, 252);
1390     _mm_storeu_si128((__m128i *) (pi2_tmp + (i4_filt_stride << 2) + (i4_filt_stride << 1) + 4),
1391                      i4_res_8x16b_r7_temp);
1392 }
1393 
1394 /*****************************************************************************/
1395 /*                                                                           */
1396 /*  Function Name : isvcd_horz_interpol_chroma_dyadic_1_sse42                 */
1397 /*                                                                           */
1398 /*  Description   : This function takes the vertically interpolated data &  */
1399 /*                  performs horizontal intra resampling for dyadic scaling  */
1400 /*                  ratios for chroma                                        */
1401 /*  Inputs        : pi2_tmp_filt_buf : ptr to the 6x8 buffer holding the     */
1402 /*                        vertically interpolated data                       */
1403 /*                    pu1_out_buf : output buffer pointer                    */
1404 /*                    i4_out_stride : output buffer stride                   */
1405 /*                    i4_phase_0 : x phase for even values of x              */
1406 /*                    i4_phase_1 : x phase for odd values of x               */
1407 /*  Globals       : none                                                     */
1408 /*  Processing    : it does the interpolation in horizontal direction and    */
1409 /*                  interleaves the results into the chroma plane of the     */
1410 /*                  output buffer                                            */
1411 /*  Outputs       : resampled pixels                                         */
1412 /*  Returns       : none                                                     */
1413 /*                                                                           */
1414 /*  Issues        : none                                                     */
1415 /*                                                                           */
1416 /*  Revision History:                                                        */
1417 /*                                                                           */
1418 /*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
1419 /*         21 05 2021   Dolan          creation                              */
1420 /*                                                                           */
1421 /*****************************************************************************/
1422 void isvcd_horz_interpol_chroma_dyadic_1_sse42(WORD16 *pi2_tmp_filt_buf, UWORD8 *pu1_out_buf,
1423                                                WORD32 i4_out_stride, WORD32 i4_phase_0,
1424                                                WORD32 i4_phase_1)
1425 {
1426     WORD32 i4_coeff_0, i4_coeff_1, i4_coeff_2, i4_coeff_3;
1427     WORD32 i4_dst_stride, i4_dst_stride2, i4_dst_stride4;
1428     UWORD8 *pu1_out;
1429     WORD16 *pi2_tmp;
1430 
1431     __m128i i4_samp_8x16b_r1_0, i4_samp_8x16b_r1_1, i4_samp_8x16b_r1_2;
1432     __m128i i4_samp_8x16b_r2_0, i4_samp_8x16b_r2_1, i4_samp_8x16b_r2_2;
1433     __m128i i4_samp_8x16b_r3_0, i4_samp_8x16b_r3_1, i4_samp_8x16b_r3_2;
1434     __m128i i4_samp_8x16b_r4_0, i4_samp_8x16b_r4_1, i4_samp_8x16b_r4_2;
1435     __m128i i4_samp_8x16b_r5_0, i4_samp_8x16b_r5_1, i4_samp_8x16b_r5_2;
1436     __m128i i4_samp_8x16b_r6_0, i4_samp_8x16b_r6_1, i4_samp_8x16b_r6_2;
1437     __m128i i4_samp_8x16b_r7_0, i4_samp_8x16b_r7_1, i4_samp_8x16b_r7_2;
1438     __m128i i4_samp_8x16b_r8_0, i4_samp_8x16b_r8_1, i4_samp_8x16b_r8_2;
1439 
1440     __m128i i4_res_4x32b_r1_0, i4_res_4x32b_r1_1;
1441     __m128i i4_res_4x32b_r2_0, i4_res_4x32b_r2_1;
1442     __m128i i4_res_4x32b_r3_0, i4_res_4x32b_r3_1;
1443     __m128i i4_res_4x32b_r4_0, i4_res_4x32b_r4_1;
1444     __m128i i4_res_4x32b_r5_0, i4_res_4x32b_r5_1;
1445     __m128i i4_res_4x32b_r6_0, i4_res_4x32b_r6_1;
1446     __m128i i4_res_4x32b_r7_0, i4_res_4x32b_r7_1;
1447     __m128i i4_res_4x32b_r8_0, i4_res_4x32b_r8_1;
1448 
1449     __m128i i4_res_final_8x16b_r1;
1450     __m128i i4_res_final_8x16b_r2;
1451     __m128i i4_res_final_8x16b_r3;
1452     __m128i i4_res_final_8x16b_r4;
1453     __m128i i4_res_final_8x16b_r5;
1454     __m128i i4_res_final_8x16b_r6;
1455     __m128i i4_res_final_8x16b_r7;
1456     __m128i i4_res_final_8x16b_r8;
1457 
1458     __m128i out_16x8b_r1;
1459     __m128i out_16x8b_r2;
1460     __m128i out_16x8b_r3;
1461     __m128i out_16x8b_r4;
1462     __m128i out_16x8b_r5;
1463     __m128i out_16x8b_r6;
1464     __m128i out_16x8b_r7;
1465     __m128i out_16x8b_r8;
1466     __m128i i4_res_final_8x16b_r12_0, i4_res_final_8x16b_r12_1;
1467     __m128i i4_res_final_8x16b_r34_0, i4_res_final_8x16b_r34_1;
1468     __m128i i4_res_final_8x16b_r56_0, i4_res_final_8x16b_r56_1;
1469     __m128i i4_res_final_8x16b_r67_0, i4_res_final_8x16b_r67_1;
1470     __m128i chroma_mask, chroma_mask2;
1471     __m128i coeff_c0_c1_8x16b, coeff_c2_c3_8x16b, res_32;
1472 
1473     i4_coeff_0 = 8 - i4_phase_0;
1474     i4_coeff_1 = i4_phase_0;
1475     i4_coeff_2 = 8 - i4_phase_1;
1476     i4_coeff_3 = i4_phase_1;
1477     coeff_c0_c1_8x16b = _mm_set_epi16(i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0, i4_coeff_1,
1478                                       i4_coeff_0, i4_coeff_1, i4_coeff_0);
1479     coeff_c2_c3_8x16b = _mm_set_epi16(i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2, i4_coeff_3,
1480                                       i4_coeff_2, i4_coeff_3, i4_coeff_2);
1481     res_32 = _mm_set1_epi32(32);
1482     pu1_out = pu1_out_buf;
1483     pi2_tmp = pi2_tmp_filt_buf;
1484     i4_dst_stride = i4_out_stride;
1485 
1486     i4_dst_stride2 = i4_dst_stride << 1;
1487     i4_dst_stride4 = i4_dst_stride << 2;
1488 
1489     /* Horizontal interpolation */
1490     /* x = 0, x_phase = phase_0 */
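    /* All 8 temp rows are filtered branch-free: per row, one madd forms the */
    /* 4 even-phase sums and a second madd (on the shifted copy) forms the   */
    /* 4 odd-phase sums; the results are re-interleaved after rounding       */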
1491     i4_samp_8x16b_r1_0 = _mm_loadu_si128((__m128i *) pi2_tmp);         // a0 a1 a2 a3 a4 a5 a6 a7
1492     i4_samp_8x16b_r2_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 6));   // b0 b1 b2 b3 b4 b5 b6 b7
1493     i4_samp_8x16b_r3_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 12));  // c0 c1 c2 c3 c4 c5 c6 c7
1494     i4_samp_8x16b_r4_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 18));  // d0 d1 d2 d3 d4 d5 d6 d7
1495     i4_samp_8x16b_r5_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 24));  // e0 e1 e2 e3 e4 e5 e6 e7
1496     i4_samp_8x16b_r6_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 30));  // f0 f1 f2 f3 f4 f5 f6 f7
1497     i4_samp_8x16b_r7_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 36));  // g0 g1 g2 g3 g4 g5 g6 g7
1498     i4_samp_8x16b_r8_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 42));  // h0 h1 h2 h3 h4 h5 h6 h7
1499 
1500     i4_samp_8x16b_r1_1 = _mm_srli_si128(i4_samp_8x16b_r1_0, 2);        // a1 a2 a3 a4 a5 a6 a7 0
1501     i4_samp_8x16b_r1_2 = _mm_srli_si128(i4_samp_8x16b_r1_0, 4);        // a2 a3 a4 a5 a6 a7 0 0
1502 
1503     i4_samp_8x16b_r2_1 = _mm_srli_si128(i4_samp_8x16b_r2_0, 2);        // b1 b2 b3 b4 b5 b6 b7 0
1504     i4_samp_8x16b_r2_2 = _mm_srli_si128(i4_samp_8x16b_r2_0, 4);        // b2 b3 b4 b5 b6 b7 0 0
1505 
1506     i4_samp_8x16b_r3_1 = _mm_srli_si128(i4_samp_8x16b_r3_0, 2);        // c1 c2 c3 c4 c5 c6 c7 0
1507     i4_samp_8x16b_r3_2 = _mm_srli_si128(i4_samp_8x16b_r3_0, 4);        // c2 c3 c4 c5 c6 c7 0 0
1508 
1509     i4_samp_8x16b_r4_1 = _mm_srli_si128(i4_samp_8x16b_r4_0, 2);        // d1 d2 d3 d4 d5 d6 d7 0
1510     i4_samp_8x16b_r4_2 = _mm_srli_si128(i4_samp_8x16b_r4_0, 4);        // d2 d3 d4 d5 d6 d7 0 0
1511 
1512     i4_samp_8x16b_r5_1 = _mm_srli_si128(i4_samp_8x16b_r5_0, 2);        // e1 e2 e3 e4 e5 e6 e7 0
1513     i4_samp_8x16b_r5_2 = _mm_srli_si128(i4_samp_8x16b_r5_0, 4);        // e2 e3 e4 e5 e6 e7 0 0
1514 
1515     i4_samp_8x16b_r6_1 = _mm_srli_si128(i4_samp_8x16b_r6_0, 2);        // f1 f2 f3 f4 f5 f6 f7 0
1516     i4_samp_8x16b_r6_2 = _mm_srli_si128(i4_samp_8x16b_r6_0, 4);        // f2 f3 f4 f5 f6 f7 0 0
1517 
1518     i4_samp_8x16b_r7_1 = _mm_srli_si128(i4_samp_8x16b_r7_0, 2);        // g1 g2 g3 g4 g5 g6 g7 0
1519     i4_samp_8x16b_r7_2 = _mm_srli_si128(i4_samp_8x16b_r7_0, 4);        // g2 g3 g4 g5 g6 g7 0 0
1520 
1521     i4_samp_8x16b_r8_1 = _mm_srli_si128(i4_samp_8x16b_r8_0, 2);        // h1 h2 h3 h4 h5 h6 h7 0
1522     i4_samp_8x16b_r8_2 = _mm_srli_si128(i4_samp_8x16b_r8_0, 4);        // h2 h3 h4 h5 h6 h7 0 0
1523 
1524     i4_samp_8x16b_r1_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r1_0,
1525                                             i4_samp_8x16b_r1_1);  // a0 a1  a1 a2  a2 a3  a3 a4
1526     i4_samp_8x16b_r2_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r2_0,
1527                                             i4_samp_8x16b_r2_1);  // b0 b1  b1 b2  b2 b3  b3 b4
1528     i4_samp_8x16b_r3_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r3_0, i4_samp_8x16b_r3_1);
1529     i4_samp_8x16b_r4_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r4_0, i4_samp_8x16b_r4_1);
1530     i4_samp_8x16b_r5_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r5_0, i4_samp_8x16b_r5_1);
1531     i4_samp_8x16b_r6_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r6_0, i4_samp_8x16b_r6_1);
1532     i4_samp_8x16b_r7_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r7_0, i4_samp_8x16b_r7_1);
1533     i4_samp_8x16b_r8_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r8_0, i4_samp_8x16b_r8_1);
1534 
1535     i4_samp_8x16b_r1_1 = _mm_unpacklo_epi16(i4_samp_8x16b_r1_1,
1536                                             i4_samp_8x16b_r1_2);  // a1 a2  a2 a3  a3 a4  a4 a5
1537     i4_samp_8x16b_r2_1 = _mm_unpacklo_epi16(i4_samp_8x16b_r2_1,
1538                                             i4_samp_8x16b_r2_2);  // b1 b2  b2 b3  b3 b4  b4 b5
1539     i4_samp_8x16b_r3_1 = _mm_unpacklo_epi16(i4_samp_8x16b_r3_1, i4_samp_8x16b_r3_2);
1540     i4_samp_8x16b_r4_1 = _mm_unpacklo_epi16(i4_samp_8x16b_r4_1, i4_samp_8x16b_r4_2);
1541     i4_samp_8x16b_r5_1 = _mm_unpacklo_epi16(i4_samp_8x16b_r5_1, i4_samp_8x16b_r5_2);
1542     i4_samp_8x16b_r6_1 = _mm_unpacklo_epi16(i4_samp_8x16b_r6_1, i4_samp_8x16b_r6_2);
1543     i4_samp_8x16b_r7_1 = _mm_unpacklo_epi16(i4_samp_8x16b_r7_1, i4_samp_8x16b_r7_2);
1544     i4_samp_8x16b_r8_1 = _mm_unpacklo_epi16(i4_samp_8x16b_r8_1, i4_samp_8x16b_r8_2);
1545 
1546     // a0c0+a1c1  a1c0+a2c1  a2c0+a3c1  a3c0+a4c1
1547     i4_res_4x32b_r1_0 = _mm_madd_epi16(i4_samp_8x16b_r1_0, coeff_c0_c1_8x16b);
1548     // b0c0+b1c1  b1c0+b2c1  b2c0+b3c1  b3c0+b4c1
1549     i4_res_4x32b_r2_0 = _mm_madd_epi16(i4_samp_8x16b_r2_0, coeff_c0_c1_8x16b);
1550     i4_res_4x32b_r3_0 = _mm_madd_epi16(i4_samp_8x16b_r3_0, coeff_c0_c1_8x16b);
1551     i4_res_4x32b_r4_0 = _mm_madd_epi16(i4_samp_8x16b_r4_0, coeff_c0_c1_8x16b);
1552     i4_res_4x32b_r5_0 = _mm_madd_epi16(i4_samp_8x16b_r5_0, coeff_c0_c1_8x16b);
1553     i4_res_4x32b_r6_0 = _mm_madd_epi16(i4_samp_8x16b_r6_0, coeff_c0_c1_8x16b);
1554     i4_res_4x32b_r7_0 = _mm_madd_epi16(i4_samp_8x16b_r7_0, coeff_c0_c1_8x16b);
1555     i4_res_4x32b_r8_0 = _mm_madd_epi16(i4_samp_8x16b_r8_0, coeff_c0_c1_8x16b);
1556 
1557     // a1c2+a2c3  a2c2+a3c3  a3c2+a4c3  a4c2+a5c3
1558     i4_res_4x32b_r1_1 = _mm_madd_epi16(i4_samp_8x16b_r1_1, coeff_c2_c3_8x16b);
1559     // b1c2+b2c3  b2c2+b3c3  b3c2+b4c3  b4c2+b5c3
1560     i4_res_4x32b_r2_1 = _mm_madd_epi16(i4_samp_8x16b_r2_1, coeff_c2_c3_8x16b);
1561     i4_res_4x32b_r3_1 = _mm_madd_epi16(i4_samp_8x16b_r3_1, coeff_c2_c3_8x16b);
1562     i4_res_4x32b_r4_1 = _mm_madd_epi16(i4_samp_8x16b_r4_1, coeff_c2_c3_8x16b);
1563     i4_res_4x32b_r5_1 = _mm_madd_epi16(i4_samp_8x16b_r5_1, coeff_c2_c3_8x16b);
1564     i4_res_4x32b_r6_1 = _mm_madd_epi16(i4_samp_8x16b_r6_1, coeff_c2_c3_8x16b);
1565     i4_res_4x32b_r7_1 = _mm_madd_epi16(i4_samp_8x16b_r7_1, coeff_c2_c3_8x16b);
1566     i4_res_4x32b_r8_1 = _mm_madd_epi16(i4_samp_8x16b_r8_1, coeff_c2_c3_8x16b);
1567 
1568     i4_res_4x32b_r1_0 = _mm_add_epi32(i4_res_4x32b_r1_0, res_32);
1569     i4_res_4x32b_r2_0 = _mm_add_epi32(i4_res_4x32b_r2_0, res_32);
1570     i4_res_4x32b_r3_0 = _mm_add_epi32(i4_res_4x32b_r3_0, res_32);
1571     i4_res_4x32b_r4_0 = _mm_add_epi32(i4_res_4x32b_r4_0, res_32);
1572     i4_res_4x32b_r5_0 = _mm_add_epi32(i4_res_4x32b_r5_0, res_32);
1573     i4_res_4x32b_r6_0 = _mm_add_epi32(i4_res_4x32b_r6_0, res_32);
1574     i4_res_4x32b_r7_0 = _mm_add_epi32(i4_res_4x32b_r7_0, res_32);
1575     i4_res_4x32b_r8_0 = _mm_add_epi32(i4_res_4x32b_r8_0, res_32);
1576 
1577     i4_res_4x32b_r1_1 = _mm_add_epi32(i4_res_4x32b_r1_1, res_32);
1578     i4_res_4x32b_r2_1 = _mm_add_epi32(i4_res_4x32b_r2_1, res_32);
1579     i4_res_4x32b_r3_1 = _mm_add_epi32(i4_res_4x32b_r3_1, res_32);
1580     i4_res_4x32b_r4_1 = _mm_add_epi32(i4_res_4x32b_r4_1, res_32);
1581     i4_res_4x32b_r5_1 = _mm_add_epi32(i4_res_4x32b_r5_1, res_32);
1582     i4_res_4x32b_r6_1 = _mm_add_epi32(i4_res_4x32b_r6_1, res_32);
1583     i4_res_4x32b_r7_1 = _mm_add_epi32(i4_res_4x32b_r7_1, res_32);
1584     i4_res_4x32b_r8_1 = _mm_add_epi32(i4_res_4x32b_r8_1, res_32);
1585 
1586     i4_res_4x32b_r1_0 = _mm_srai_epi32(i4_res_4x32b_r1_0, 6);
1587     i4_res_4x32b_r2_0 = _mm_srai_epi32(i4_res_4x32b_r2_0, 6);
1588     i4_res_4x32b_r3_0 = _mm_srai_epi32(i4_res_4x32b_r3_0, 6);
1589     i4_res_4x32b_r4_0 = _mm_srai_epi32(i4_res_4x32b_r4_0, 6);
1590     i4_res_4x32b_r5_0 = _mm_srai_epi32(i4_res_4x32b_r5_0, 6);
1591     i4_res_4x32b_r6_0 = _mm_srai_epi32(i4_res_4x32b_r6_0, 6);
1592     i4_res_4x32b_r7_0 = _mm_srai_epi32(i4_res_4x32b_r7_0, 6);
1593     i4_res_4x32b_r8_0 = _mm_srai_epi32(i4_res_4x32b_r8_0, 6);
1594 
1595     i4_res_4x32b_r1_1 = _mm_srai_epi32(i4_res_4x32b_r1_1, 6);
1596     i4_res_4x32b_r2_1 = _mm_srai_epi32(i4_res_4x32b_r2_1, 6);
1597     i4_res_4x32b_r3_1 = _mm_srai_epi32(i4_res_4x32b_r3_1, 6);
1598     i4_res_4x32b_r4_1 = _mm_srai_epi32(i4_res_4x32b_r4_1, 6);
1599     i4_res_4x32b_r5_1 = _mm_srai_epi32(i4_res_4x32b_r5_1, 6);
1600     i4_res_4x32b_r6_1 = _mm_srai_epi32(i4_res_4x32b_r6_1, 6);
1601     i4_res_4x32b_r7_1 = _mm_srai_epi32(i4_res_4x32b_r7_1, 6);
1602     i4_res_4x32b_r8_1 = _mm_srai_epi32(i4_res_4x32b_r8_1, 6);
1603 
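    /* pack each row's even-phase and odd-phase sums to 16 bits, then     */
    /* interleave so every register holds one full 8-sample output row in */
    /* raster order                                                       */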
1604     i4_res_final_8x16b_r12_0 = _mm_packs_epi32(i4_res_4x32b_r1_0, i4_res_4x32b_r2_0);
1605     i4_res_final_8x16b_r34_0 = _mm_packs_epi32(i4_res_4x32b_r3_0, i4_res_4x32b_r4_0);
1606     i4_res_final_8x16b_r56_0 = _mm_packs_epi32(i4_res_4x32b_r5_0, i4_res_4x32b_r6_0);
1607     i4_res_final_8x16b_r67_0 = _mm_packs_epi32(i4_res_4x32b_r7_0, i4_res_4x32b_r8_0);
1608 
1609     i4_res_final_8x16b_r12_1 = _mm_packs_epi32(i4_res_4x32b_r1_1, i4_res_4x32b_r2_1);
1610     i4_res_final_8x16b_r34_1 = _mm_packs_epi32(i4_res_4x32b_r3_1, i4_res_4x32b_r4_1);
1611     i4_res_final_8x16b_r56_1 = _mm_packs_epi32(i4_res_4x32b_r5_1, i4_res_4x32b_r6_1);
1612     i4_res_final_8x16b_r67_1 = _mm_packs_epi32(i4_res_4x32b_r7_1, i4_res_4x32b_r8_1);
1613 
1614     i4_res_final_8x16b_r1 = _mm_unpacklo_epi16(i4_res_final_8x16b_r12_0, i4_res_final_8x16b_r12_1);
1615     i4_res_final_8x16b_r2 = _mm_unpackhi_epi16(i4_res_final_8x16b_r12_0, i4_res_final_8x16b_r12_1);
1616     i4_res_final_8x16b_r3 = _mm_unpacklo_epi16(i4_res_final_8x16b_r34_0, i4_res_final_8x16b_r34_1);
1617     i4_res_final_8x16b_r4 = _mm_unpackhi_epi16(i4_res_final_8x16b_r34_0, i4_res_final_8x16b_r34_1);
1618     i4_res_final_8x16b_r5 = _mm_unpacklo_epi16(i4_res_final_8x16b_r56_0, i4_res_final_8x16b_r56_1);
1619     i4_res_final_8x16b_r6 = _mm_unpackhi_epi16(i4_res_final_8x16b_r56_0, i4_res_final_8x16b_r56_1);
1620     i4_res_final_8x16b_r7 = _mm_unpacklo_epi16(i4_res_final_8x16b_r67_0, i4_res_final_8x16b_r67_1);
1621     i4_res_final_8x16b_r8 = _mm_unpackhi_epi16(i4_res_final_8x16b_r67_0, i4_res_final_8x16b_r67_1);
1622 
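    /* The output plane interleaves Cb and Cr: 0xFF00 keeps the co-located */
    /* bytes of the other component, 0x00FF keeps the low byte of each new */
    /* result, and the byte-wise add of the disjoint halves acts as an OR  */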
1623     chroma_mask = _mm_set1_epi16(0xFF00);
1624     chroma_mask2 = _mm_set1_epi16(0x00FF);
1625     out_16x8b_r1 = _mm_loadu_si128((__m128i *) (&pu1_out[0]));
1626     out_16x8b_r2 = _mm_loadu_si128((__m128i *) (&pu1_out[i4_dst_stride]));
1627     out_16x8b_r3 = _mm_loadu_si128((__m128i *) (&pu1_out[i4_dst_stride2]));
1628     out_16x8b_r4 = _mm_loadu_si128((__m128i *) (&pu1_out[i4_dst_stride2 + i4_dst_stride]));
1629     out_16x8b_r5 = _mm_loadu_si128((__m128i *) (&pu1_out[i4_dst_stride4]));
1630     out_16x8b_r6 = _mm_loadu_si128((__m128i *) (&pu1_out[i4_dst_stride4 + i4_dst_stride]));
1631     out_16x8b_r7 = _mm_loadu_si128((__m128i *) (&pu1_out[i4_dst_stride4 + i4_dst_stride2]));
1632     out_16x8b_r8 =
1633         _mm_loadu_si128((__m128i *) (&pu1_out[i4_dst_stride4 + i4_dst_stride2 + i4_dst_stride]));
1634 
1635     out_16x8b_r1 = _mm_and_si128(out_16x8b_r1, chroma_mask);
1636     out_16x8b_r2 = _mm_and_si128(out_16x8b_r2, chroma_mask);
1637     out_16x8b_r3 = _mm_and_si128(out_16x8b_r3, chroma_mask);
1638     out_16x8b_r4 = _mm_and_si128(out_16x8b_r4, chroma_mask);
1639     out_16x8b_r5 = _mm_and_si128(out_16x8b_r5, chroma_mask);
1640     out_16x8b_r6 = _mm_and_si128(out_16x8b_r6, chroma_mask);
1641     out_16x8b_r7 = _mm_and_si128(out_16x8b_r7, chroma_mask);
1642     out_16x8b_r8 = _mm_and_si128(out_16x8b_r8, chroma_mask);
1643 
1644     i4_res_final_8x16b_r1 = _mm_and_si128(i4_res_final_8x16b_r1, chroma_mask2);
1645     i4_res_final_8x16b_r2 = _mm_and_si128(i4_res_final_8x16b_r2, chroma_mask2);
1646     i4_res_final_8x16b_r3 = _mm_and_si128(i4_res_final_8x16b_r3, chroma_mask2);
1647     i4_res_final_8x16b_r4 = _mm_and_si128(i4_res_final_8x16b_r4, chroma_mask2);
1648     i4_res_final_8x16b_r5 = _mm_and_si128(i4_res_final_8x16b_r5, chroma_mask2);
1649     i4_res_final_8x16b_r6 = _mm_and_si128(i4_res_final_8x16b_r6, chroma_mask2);
1650     i4_res_final_8x16b_r7 = _mm_and_si128(i4_res_final_8x16b_r7, chroma_mask2);
1651     i4_res_final_8x16b_r8 = _mm_and_si128(i4_res_final_8x16b_r8, chroma_mask2);
1652 
1653     out_16x8b_r1 = _mm_add_epi8(i4_res_final_8x16b_r1, out_16x8b_r1);
1654     out_16x8b_r2 = _mm_add_epi8(i4_res_final_8x16b_r2, out_16x8b_r2);
1655     out_16x8b_r3 = _mm_add_epi8(i4_res_final_8x16b_r3, out_16x8b_r3);
1656     out_16x8b_r4 = _mm_add_epi8(i4_res_final_8x16b_r4, out_16x8b_r4);
1657     out_16x8b_r5 = _mm_add_epi8(i4_res_final_8x16b_r5, out_16x8b_r5);
1658     out_16x8b_r6 = _mm_add_epi8(i4_res_final_8x16b_r6, out_16x8b_r6);
1659     out_16x8b_r7 = _mm_add_epi8(i4_res_final_8x16b_r7, out_16x8b_r7);
1660     out_16x8b_r8 = _mm_add_epi8(i4_res_final_8x16b_r8, out_16x8b_r8);
1661 
1662     _mm_storeu_si128((__m128i *) pu1_out, out_16x8b_r1);
1663     _mm_storeu_si128((__m128i *) (pu1_out + i4_dst_stride), out_16x8b_r2);
1664     _mm_storeu_si128((__m128i *) (pu1_out + (i4_dst_stride << 1)), out_16x8b_r3);
1665     _mm_storeu_si128((__m128i *) (pu1_out + (i4_dst_stride * 3)), out_16x8b_r4);
1666     _mm_storeu_si128((__m128i *) (pu1_out + (i4_dst_stride << 2)), out_16x8b_r5);
1667     _mm_storeu_si128((__m128i *) (pu1_out + (i4_dst_stride * 5)), out_16x8b_r6);
1668     _mm_storeu_si128((__m128i *) (pu1_out + (i4_dst_stride * 6)), out_16x8b_r7);
1669     _mm_storeu_si128((__m128i *) (pu1_out + (i4_dst_stride * 7)), out_16x8b_r8);
1670     /* End of loop over x */
1671 } /* isvcd_horz_interpol_chroma_dyadic_1_sse42 */
1672 
1673 /*****************************************************************************/
1674 /*                                                                           */
1675 /*  Function Name : isvcd_horz_interpol_chroma_dyadic_2_sse42                 */
1676 /*                                                                           */
1677 /*  Description   : This function takes the vertically interpolated data &  */
1678 /*                  performs horizontal intra resampling for dyadic scaling  */
1679 /*                  ratios for chroma                                        */
1680 /*  Inputs        : pi2_tmp_filt_buf : ptr to the 6x8 buffer holding the     */
1681 /*                        vertically interpolated data                       */
1682 /*                    pu1_out_buf : output buffer pointer                    */
1683 /*                    i4_out_stride : output buffer stride                   */
1684 /*                    i4_phase_0 : x phase for even values of x              */
1685 /*                    i4_phase_1 : x phase for odd values of x               */
1686 /*  Globals       : none                                                     */
1687 /*  Processing    : it does the interpolation in horizontal direction and    */
1688 /*                  interleaves the results into the chroma plane of the     */
1689 /*                  output buffer                                            */
1690 /*  Outputs       : resampled pixels                                         */
1691 /*  Returns       : none                                                     */
1692 /*                                                                           */
1693 /*  Issues        : none                                                     */
1694 /*                                                                           */
1695 /*  Revision History:                                                        */
1696 /*                                                                           */
1697 /*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
1698 /*         21 05 2021   Dolan          creation                              */
1699 /*                                                                           */
1700 /*****************************************************************************/
1701 void isvcd_horz_interpol_chroma_dyadic_2_sse42(WORD16 *pi2_tmp_filt_buf, UWORD8 *pu1_out_buf,
1702                                                WORD32 i4_out_stride, WORD32 i4_phase_0,
1703                                                WORD32 i4_phase_1)
1704 {
1705     WORD32 i4_coeff_0, i4_coeff_1, i4_coeff_2, i4_coeff_3;
1706     WORD32 i4_dst_stride, i4_dst_stride2, i4_dst_stride4;
1707     UWORD8 *pu1_out;
1708     WORD16 *pi2_tmp;
1709 
1710     __m128i i4_samp_8x16b_r1_0, i4_samp_8x16b_r1_1;
1711     __m128i i4_samp_8x16b_r2_0, i4_samp_8x16b_r2_1;
1712     __m128i i4_samp_8x16b_r3_0, i4_samp_8x16b_r3_1;
1713     __m128i i4_samp_8x16b_r4_0, i4_samp_8x16b_r4_1;
1714     __m128i i4_samp_8x16b_r5_0, i4_samp_8x16b_r5_1;
1715     __m128i i4_samp_8x16b_r6_0, i4_samp_8x16b_r6_1;
1716     __m128i i4_samp_8x16b_r7_0, i4_samp_8x16b_r7_1;
1717     __m128i i4_samp_8x16b_r8_0, i4_samp_8x16b_r8_1;
1718 
1719     __m128i i4_res_4x32b_r1_0, i4_res_4x32b_r1_1;
1720     __m128i i4_res_4x32b_r2_0, i4_res_4x32b_r2_1;
1721     __m128i i4_res_4x32b_r3_0, i4_res_4x32b_r3_1;
1722     __m128i i4_res_4x32b_r4_0, i4_res_4x32b_r4_1;
1723     __m128i i4_res_4x32b_r5_0, i4_res_4x32b_r5_1;
1724     __m128i i4_res_4x32b_r6_0, i4_res_4x32b_r6_1;
1725     __m128i i4_res_4x32b_r7_0, i4_res_4x32b_r7_1;
1726     __m128i i4_res_4x32b_r8_0, i4_res_4x32b_r8_1;
1727 
1728     __m128i i4_res_final_8x16b_r1;
1729     __m128i i4_res_final_8x16b_r2;
1730     __m128i i4_res_final_8x16b_r3;
1731     __m128i i4_res_final_8x16b_r4;
1732     __m128i i4_res_final_8x16b_r5;
1733     __m128i i4_res_final_8x16b_r6;
1734     __m128i i4_res_final_8x16b_r7;
1735     __m128i i4_res_final_8x16b_r8;
1736 
1737     __m128i out_16x8b_r1;
1738     __m128i out_16x8b_r2;
1739     __m128i out_16x8b_r3;
1740     __m128i out_16x8b_r4;
1741     __m128i out_16x8b_r5;
1742     __m128i out_16x8b_r6;
1743     __m128i out_16x8b_r7;
1744     __m128i out_16x8b_r8;
1745     __m128i i4_res_final_8x16b_r12_0, i4_res_final_8x16b_r12_1;
1746     __m128i i4_res_final_8x16b_r34_0, i4_res_final_8x16b_r34_1;
1747     __m128i i4_res_final_8x16b_r56_0, i4_res_final_8x16b_r56_1;
1748     __m128i i4_res_final_8x16b_r67_0, i4_res_final_8x16b_r67_1;
1749     __m128i chroma_mask, chroma_mask2;
1750     __m128i coeff_c0_c1_8x16b, coeff_c2_c3_8x16b, res_32;
1751 
1752     i4_coeff_0 = 8 - i4_phase_0;
1753     i4_coeff_1 = i4_phase_0;
1754     i4_coeff_2 = 8 - i4_phase_1;
1755     i4_coeff_3 = i4_phase_1;
1756     coeff_c0_c1_8x16b = _mm_set_epi16(i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0, i4_coeff_1,
1757                                       i4_coeff_0, i4_coeff_1, i4_coeff_0);
1758     coeff_c2_c3_8x16b = _mm_set_epi16(i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2, i4_coeff_3,
1759                                       i4_coeff_2, i4_coeff_3, i4_coeff_2);
1760     res_32 = _mm_set1_epi32(32);
1761     pu1_out = pu1_out_buf;
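    /* this phase pair starts one sample into the temp buffer, and the even  */
    /* and odd x outputs share the same sample pairs, so a single unpack per */
    /* row feeds both madds below                                            */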
1762     pi2_tmp = pi2_tmp_filt_buf + 1;
1763     i4_dst_stride = i4_out_stride;
1764 
1765     i4_dst_stride2 = i4_dst_stride << 1;
1766     i4_dst_stride4 = i4_dst_stride << 2;
1767 
1768     /* Horizontal interpolation */
1769     /* x = 0, x_phase = phase_0 */
1770     i4_samp_8x16b_r1_0 = _mm_loadu_si128((__m128i *) pi2_tmp);         // a0 a1 a2 a3 a4 a5 a6 a7
1771     i4_samp_8x16b_r2_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 6));   // b0 b1 b2 b3 b4 b5 b6 b7
1772     i4_samp_8x16b_r3_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 12));  // c0 c1 c2 c3 c4 c5 c6 c7
1773     i4_samp_8x16b_r4_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 18));  // d0 d1 d2 d3 d4 d5 d6 d7
1774     i4_samp_8x16b_r5_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 24));  // e0 e1 e2 e3 e4 e5 e6 e7
1775     i4_samp_8x16b_r6_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 30));  // f0 f1 f2 f3 f4 f5 f6 f7
1776     i4_samp_8x16b_r7_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 36));  // g0 g1 g2 g3 g4 g5 g6 g7
1777     i4_samp_8x16b_r8_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 42));  // h0 h1 h2 h3 h4 h5 h6 h7
1778 
1779     i4_samp_8x16b_r1_1 = _mm_srli_si128(i4_samp_8x16b_r1_0, 2);        // a1 a2 a3 a4 a5 a6 a7 0
1780     i4_samp_8x16b_r2_1 = _mm_srli_si128(i4_samp_8x16b_r2_0, 2);        // b1 b2 b3 b4 b5 b6 b7 0
1781     i4_samp_8x16b_r3_1 = _mm_srli_si128(i4_samp_8x16b_r3_0, 2);        // c1 c2 c3 c4 c5 c6 c7 0
1782     i4_samp_8x16b_r4_1 = _mm_srli_si128(i4_samp_8x16b_r4_0, 2);        // d1 d2 d3 d4 d5 d6 d7 0
1783     i4_samp_8x16b_r5_1 = _mm_srli_si128(i4_samp_8x16b_r5_0, 2);        // e1 e2 e3 e4 e5 e6 e7 0
1784     i4_samp_8x16b_r6_1 = _mm_srli_si128(i4_samp_8x16b_r6_0, 2);        // f1 f2 f3 f4 f5 f6 f7 0
1785     i4_samp_8x16b_r7_1 = _mm_srli_si128(i4_samp_8x16b_r7_0, 2);        // g1 g2 g3 g4 g5 g6 g7 0
1786     i4_samp_8x16b_r8_1 = _mm_srli_si128(i4_samp_8x16b_r8_0, 2);        // h1 h2 h3 h4 h5 h6 h7 0
1787 
1788     i4_samp_8x16b_r1_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r1_0,
1789                                             i4_samp_8x16b_r1_1);  // a0 a1  a1 a2  a2 a3  a3 a4
1790     i4_samp_8x16b_r2_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r2_0,
1791                                             i4_samp_8x16b_r2_1);  // b0 b1  b1 b2  b2 b3  b3 b4
1792     i4_samp_8x16b_r3_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r3_0, i4_samp_8x16b_r3_1);
1793     i4_samp_8x16b_r4_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r4_0, i4_samp_8x16b_r4_1);
1794     i4_samp_8x16b_r5_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r5_0, i4_samp_8x16b_r5_1);
1795     i4_samp_8x16b_r6_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r6_0, i4_samp_8x16b_r6_1);
1796     i4_samp_8x16b_r7_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r7_0, i4_samp_8x16b_r7_1);
1797     i4_samp_8x16b_r8_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r8_0, i4_samp_8x16b_r8_1);
1798 
1799     // a0c0+a1c1  a1c0+a2c1  a2c0+a3c1  a3c0+a4c1
1800     i4_res_4x32b_r1_0 = _mm_madd_epi16(i4_samp_8x16b_r1_0, coeff_c0_c1_8x16b);
1801     // b0c0+b1c1  b1c0+b2c1  b2c0+b3c1  b3c0+b4c1
1802     i4_res_4x32b_r2_0 = _mm_madd_epi16(i4_samp_8x16b_r2_0, coeff_c0_c1_8x16b);
1803     i4_res_4x32b_r3_0 = _mm_madd_epi16(i4_samp_8x16b_r3_0, coeff_c0_c1_8x16b);
1804     i4_res_4x32b_r4_0 = _mm_madd_epi16(i4_samp_8x16b_r4_0, coeff_c0_c1_8x16b);
1805     i4_res_4x32b_r5_0 = _mm_madd_epi16(i4_samp_8x16b_r5_0, coeff_c0_c1_8x16b);
1806     i4_res_4x32b_r6_0 = _mm_madd_epi16(i4_samp_8x16b_r6_0, coeff_c0_c1_8x16b);
1807     i4_res_4x32b_r7_0 = _mm_madd_epi16(i4_samp_8x16b_r7_0, coeff_c0_c1_8x16b);
1808     i4_res_4x32b_r8_0 = _mm_madd_epi16(i4_samp_8x16b_r8_0, coeff_c0_c1_8x16b);
1809 
1810     // a1c2+a2c3  a2c2+a3c3  a3c2+a4c3  a4c2+a5c3
1811     i4_res_4x32b_r1_1 = _mm_madd_epi16(i4_samp_8x16b_r1_0, coeff_c2_c3_8x16b);
1812     // b1c2+b2c3  b2c2+b3c3  b3c2+b4c3  b4c2+b5c3
1813     i4_res_4x32b_r2_1 = _mm_madd_epi16(i4_samp_8x16b_r2_0, coeff_c2_c3_8x16b);
1814     i4_res_4x32b_r3_1 = _mm_madd_epi16(i4_samp_8x16b_r3_0, coeff_c2_c3_8x16b);
1815     i4_res_4x32b_r4_1 = _mm_madd_epi16(i4_samp_8x16b_r4_0, coeff_c2_c3_8x16b);
1816     i4_res_4x32b_r5_1 = _mm_madd_epi16(i4_samp_8x16b_r5_0, coeff_c2_c3_8x16b);
1817     i4_res_4x32b_r6_1 = _mm_madd_epi16(i4_samp_8x16b_r6_0, coeff_c2_c3_8x16b);
1818     i4_res_4x32b_r7_1 = _mm_madd_epi16(i4_samp_8x16b_r7_0, coeff_c2_c3_8x16b);
1819     i4_res_4x32b_r8_1 = _mm_madd_epi16(i4_samp_8x16b_r8_0, coeff_c2_c3_8x16b);
1820 
1821     i4_res_4x32b_r1_0 = _mm_add_epi32(i4_res_4x32b_r1_0, res_32);
1822     i4_res_4x32b_r2_0 = _mm_add_epi32(i4_res_4x32b_r2_0, res_32);
1823     i4_res_4x32b_r3_0 = _mm_add_epi32(i4_res_4x32b_r3_0, res_32);
1824     i4_res_4x32b_r4_0 = _mm_add_epi32(i4_res_4x32b_r4_0, res_32);
1825     i4_res_4x32b_r5_0 = _mm_add_epi32(i4_res_4x32b_r5_0, res_32);
1826     i4_res_4x32b_r6_0 = _mm_add_epi32(i4_res_4x32b_r6_0, res_32);
1827     i4_res_4x32b_r7_0 = _mm_add_epi32(i4_res_4x32b_r7_0, res_32);
1828     i4_res_4x32b_r8_0 = _mm_add_epi32(i4_res_4x32b_r8_0, res_32);
1829 
1830     i4_res_4x32b_r1_1 = _mm_add_epi32(i4_res_4x32b_r1_1, res_32);
1831     i4_res_4x32b_r2_1 = _mm_add_epi32(i4_res_4x32b_r2_1, res_32);
1832     i4_res_4x32b_r3_1 = _mm_add_epi32(i4_res_4x32b_r3_1, res_32);
1833     i4_res_4x32b_r4_1 = _mm_add_epi32(i4_res_4x32b_r4_1, res_32);
1834     i4_res_4x32b_r5_1 = _mm_add_epi32(i4_res_4x32b_r5_1, res_32);
1835     i4_res_4x32b_r6_1 = _mm_add_epi32(i4_res_4x32b_r6_1, res_32);
1836     i4_res_4x32b_r7_1 = _mm_add_epi32(i4_res_4x32b_r7_1, res_32);
1837     i4_res_4x32b_r8_1 = _mm_add_epi32(i4_res_4x32b_r8_1, res_32);
1838 
1839     i4_res_4x32b_r1_0 = _mm_srai_epi32(i4_res_4x32b_r1_0, 6);
1840     i4_res_4x32b_r2_0 = _mm_srai_epi32(i4_res_4x32b_r2_0, 6);
1841     i4_res_4x32b_r3_0 = _mm_srai_epi32(i4_res_4x32b_r3_0, 6);
1842     i4_res_4x32b_r4_0 = _mm_srai_epi32(i4_res_4x32b_r4_0, 6);
1843     i4_res_4x32b_r5_0 = _mm_srai_epi32(i4_res_4x32b_r5_0, 6);
1844     i4_res_4x32b_r6_0 = _mm_srai_epi32(i4_res_4x32b_r6_0, 6);
1845     i4_res_4x32b_r7_0 = _mm_srai_epi32(i4_res_4x32b_r7_0, 6);
1846     i4_res_4x32b_r8_0 = _mm_srai_epi32(i4_res_4x32b_r8_0, 6);
1847 
1848     i4_res_4x32b_r1_1 = _mm_srai_epi32(i4_res_4x32b_r1_1, 6);
1849     i4_res_4x32b_r2_1 = _mm_srai_epi32(i4_res_4x32b_r2_1, 6);
1850     i4_res_4x32b_r3_1 = _mm_srai_epi32(i4_res_4x32b_r3_1, 6);
1851     i4_res_4x32b_r4_1 = _mm_srai_epi32(i4_res_4x32b_r4_1, 6);
1852     i4_res_4x32b_r5_1 = _mm_srai_epi32(i4_res_4x32b_r5_1, 6);
1853     i4_res_4x32b_r6_1 = _mm_srai_epi32(i4_res_4x32b_r6_1, 6);
1854     i4_res_4x32b_r7_1 = _mm_srai_epi32(i4_res_4x32b_r7_1, 6);
1855     i4_res_4x32b_r8_1 = _mm_srai_epi32(i4_res_4x32b_r8_1, 6);
1856 
1857     i4_res_final_8x16b_r12_0 = _mm_packs_epi32(i4_res_4x32b_r1_0, i4_res_4x32b_r2_0);
1858     i4_res_final_8x16b_r34_0 = _mm_packs_epi32(i4_res_4x32b_r3_0, i4_res_4x32b_r4_0);
1859     i4_res_final_8x16b_r56_0 = _mm_packs_epi32(i4_res_4x32b_r5_0, i4_res_4x32b_r6_0);
1860     i4_res_final_8x16b_r67_0 = _mm_packs_epi32(i4_res_4x32b_r7_0, i4_res_4x32b_r8_0);
1861 
1862     i4_res_final_8x16b_r12_1 = _mm_packs_epi32(i4_res_4x32b_r1_1, i4_res_4x32b_r2_1);
1863     i4_res_final_8x16b_r34_1 = _mm_packs_epi32(i4_res_4x32b_r3_1, i4_res_4x32b_r4_1);
1864     i4_res_final_8x16b_r56_1 = _mm_packs_epi32(i4_res_4x32b_r5_1, i4_res_4x32b_r6_1);
1865     i4_res_final_8x16b_r67_1 = _mm_packs_epi32(i4_res_4x32b_r7_1, i4_res_4x32b_r8_1);
1866 
    i4_res_final_8x16b_r1 = _mm_unpacklo_epi16(i4_res_final_8x16b_r12_0, i4_res_final_8x16b_r12_1);
    i4_res_final_8x16b_r2 = _mm_unpackhi_epi16(i4_res_final_8x16b_r12_0, i4_res_final_8x16b_r12_1);
    i4_res_final_8x16b_r3 = _mm_unpacklo_epi16(i4_res_final_8x16b_r34_0, i4_res_final_8x16b_r34_1);
    i4_res_final_8x16b_r4 = _mm_unpackhi_epi16(i4_res_final_8x16b_r34_0, i4_res_final_8x16b_r34_1);
    i4_res_final_8x16b_r5 = _mm_unpacklo_epi16(i4_res_final_8x16b_r56_0, i4_res_final_8x16b_r56_1);
    i4_res_final_8x16b_r6 = _mm_unpackhi_epi16(i4_res_final_8x16b_r56_0, i4_res_final_8x16b_r56_1);
    i4_res_final_8x16b_r7 = _mm_unpacklo_epi16(i4_res_final_8x16b_r67_0, i4_res_final_8x16b_r67_1);
    i4_res_final_8x16b_r8 = _mm_unpackhi_epi16(i4_res_final_8x16b_r67_0, i4_res_final_8x16b_r67_1);

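    /* The destination holds interleaved chroma, so read-modify-write: load  */
    /* the existing rows and build byte-select masks (0xFF00 keeps the other */
    /* component, 0x00FF keeps the freshly interpolated bytes)               */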
    chroma_mask = _mm_set1_epi16(0xFF00);
    chroma_mask2 = _mm_set1_epi16(0x00FF);
    out_16x8b_r1 = _mm_loadu_si128((__m128i *) (&pu1_out[0]));
    out_16x8b_r2 = _mm_loadu_si128((__m128i *) (&pu1_out[i4_dst_stride]));
    out_16x8b_r3 = _mm_loadu_si128((__m128i *) (&pu1_out[i4_dst_stride2]));
    out_16x8b_r4 = _mm_loadu_si128((__m128i *) (&pu1_out[i4_dst_stride2 + i4_dst_stride]));
    out_16x8b_r5 = _mm_loadu_si128((__m128i *) (&pu1_out[i4_dst_stride4]));
    out_16x8b_r6 = _mm_loadu_si128((__m128i *) (&pu1_out[i4_dst_stride4 + i4_dst_stride]));
    out_16x8b_r7 = _mm_loadu_si128((__m128i *) (&pu1_out[i4_dst_stride4 + i4_dst_stride2]));
    out_16x8b_r8 =
        _mm_loadu_si128((__m128i *) (&pu1_out[i4_dst_stride4 + i4_dst_stride2 + i4_dst_stride]));

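    /* Preserve the other chroma component's bytes in the existing output */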
    out_16x8b_r1 = _mm_and_si128(out_16x8b_r1, chroma_mask);
    out_16x8b_r2 = _mm_and_si128(out_16x8b_r2, chroma_mask);
    out_16x8b_r3 = _mm_and_si128(out_16x8b_r3, chroma_mask);
    out_16x8b_r4 = _mm_and_si128(out_16x8b_r4, chroma_mask);
    out_16x8b_r5 = _mm_and_si128(out_16x8b_r5, chroma_mask);
    out_16x8b_r6 = _mm_and_si128(out_16x8b_r6, chroma_mask);
    out_16x8b_r7 = _mm_and_si128(out_16x8b_r7, chroma_mask);
    out_16x8b_r8 = _mm_and_si128(out_16x8b_r8, chroma_mask);

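    /* Keep only the low byte of each interpolated 16-bit result */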
    i4_res_final_8x16b_r1 = _mm_and_si128(i4_res_final_8x16b_r1, chroma_mask2);
    i4_res_final_8x16b_r2 = _mm_and_si128(i4_res_final_8x16b_r2, chroma_mask2);
    i4_res_final_8x16b_r3 = _mm_and_si128(i4_res_final_8x16b_r3, chroma_mask2);
    i4_res_final_8x16b_r4 = _mm_and_si128(i4_res_final_8x16b_r4, chroma_mask2);
    i4_res_final_8x16b_r5 = _mm_and_si128(i4_res_final_8x16b_r5, chroma_mask2);
    i4_res_final_8x16b_r6 = _mm_and_si128(i4_res_final_8x16b_r6, chroma_mask2);
    i4_res_final_8x16b_r7 = _mm_and_si128(i4_res_final_8x16b_r7, chroma_mask2);
    i4_res_final_8x16b_r8 = _mm_and_si128(i4_res_final_8x16b_r8, chroma_mask2);

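    /* The operands occupy disjoint byte lanes, so this add behaves as an OR */
    /* and merges the new samples with the preserved component               */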
    out_16x8b_r1 = _mm_add_epi8(i4_res_final_8x16b_r1, out_16x8b_r1);
    out_16x8b_r2 = _mm_add_epi8(i4_res_final_8x16b_r2, out_16x8b_r2);
    out_16x8b_r3 = _mm_add_epi8(i4_res_final_8x16b_r3, out_16x8b_r3);
    out_16x8b_r4 = _mm_add_epi8(i4_res_final_8x16b_r4, out_16x8b_r4);
    out_16x8b_r5 = _mm_add_epi8(i4_res_final_8x16b_r5, out_16x8b_r5);
    out_16x8b_r6 = _mm_add_epi8(i4_res_final_8x16b_r6, out_16x8b_r6);
    out_16x8b_r7 = _mm_add_epi8(i4_res_final_8x16b_r7, out_16x8b_r7);
    out_16x8b_r8 = _mm_add_epi8(i4_res_final_8x16b_r8, out_16x8b_r8);

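    /* Store the merged rows back to the interleaved output */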
    _mm_storeu_si128((__m128i *) pu1_out, out_16x8b_r1);
    _mm_storeu_si128((__m128i *) (pu1_out + i4_dst_stride), out_16x8b_r2);
    _mm_storeu_si128((__m128i *) (pu1_out + (i4_dst_stride << 1)), out_16x8b_r3);
    _mm_storeu_si128((__m128i *) (pu1_out + (i4_dst_stride * 3)), out_16x8b_r4);
    _mm_storeu_si128((__m128i *) (pu1_out + (i4_dst_stride << 2)), out_16x8b_r5);
    _mm_storeu_si128((__m128i *) (pu1_out + (i4_dst_stride * 5)), out_16x8b_r6);
    _mm_storeu_si128((__m128i *) (pu1_out + (i4_dst_stride * 6)), out_16x8b_r7);
    _mm_storeu_si128((__m128i *) (pu1_out + (i4_dst_stride * 7)), out_16x8b_r8);

    /* End of 8x8 chroma block processing */
}