/******************************************************************************
 *
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
/**
 *******************************************************************************
 * @file
 *  ih264e_half_pel_ssse3.c
 *
 * @brief
 *  Contains the x86 SSSE3 intrinsic function definitions for the 6-tap
 *  horizontal filter and the cascaded 2D filter used in motion estimation
 *  in the H264 encoder.
 *
 * @author
 *  ittiam
 *
 * @par List of Functions:
 *  ih264e_sixtapfilter_horz_ssse3
 *  ih264e_sixtap_filter_2dvh_vert_ssse3
 *
 * @remarks
 *  none
 *
 *******************************************************************************
 */

/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/

/* System include files */
#include <stdio.h>
#include <assert.h>
#include <limits.h>

/* User include files */
#include "ih264_typedefs.h"
#include "ithread.h"
#include "ih264_platform_macros.h"
#include "ih264_defs.h"
#include "ih264e_half_pel.h"
#include "ih264_macros.h"
#include "ih264_inter_pred_filters.h"
#include "ih264_mem_fns.h"
#include "ih264_padding.h"
#include "ih264_intra_pred_filters.h"
#include "ih264_deblk_edge_filters.h"


/*****************************************************************************/
/* Function Definitions                                                      */
/*****************************************************************************/
/*
*******************************************************************************
*
* @brief
*  Inter prediction luma filter for horizontal input (filter run for
*  width = 17 and height = 16)
*
* @par Description:
*  Applies a 6-tap horizontal filter. The output is clipped to 8 bits as per
*  sec 8.4.2.2.1 titled "Luma sample interpolation process"
*
* @param[in] pu1_src
*  UWORD8 pointer to the source
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] src_strd
*  integer source stride
*
* @param[in] dst_strd
*  integer destination stride
*
* @returns
*  none
*
* @remarks
*  none
*
*******************************************************************************
*/
void ih264e_sixtapfilter_horz_ssse3(UWORD8 *pu1_src,
                                    UWORD8 *pu1_dst,
                                    WORD32 src_strd,
                                    WORD32 dst_strd)
{
    WORD32 ht;
    WORD32 tmp;

    __m128i src_r0_16x8b, src_r1_16x8b, src_r0_sht_16x8b, src_r1_sht_16x8b;
    __m128i src_r0_t1_16x8b, src_r1_t1_16x8b;

    __m128i res_r0_t1_8x16b, res_r0_t2_8x16b, res_r0_t3_8x16b;
    __m128i res_r1_t1_8x16b, res_r1_t2_8x16b, res_r1_t3_8x16b;

    __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b;
    __m128i const_val16_8x16b;

    ht = 16;
    pu1_src -= 2; // the filter input starts from x[-2] (till x[3])

    coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01); //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1
    coeff2_3_16x8b = _mm_set1_epi32(0x14141414); //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3
    coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB); //c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5
                                                 //c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20
    const_val16_8x16b = _mm_set1_epi16(16);
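
    /*
     * Note: _mm_maddubs_epi16 multiplies the unsigned 8-bit source samples
     * with the signed 8-bit coefficients packed above (0xFB == -5) and adds
     * adjacent products into signed 16-bit sums, so each call below evaluates
     * one coefficient pair of the 6-tap filter for eight output pixels.
     */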

    //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
    //Row0 :                         b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....
    //b0 is the same as a8. Similarly, each bn pixel is the same as the a(n+8) pixel.

    do
    {
        src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);                     //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15
        src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 8));               //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15

        src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1);                     //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
        src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1);                     //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0

        src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);    //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
        src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);    //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8

        res_r0_t1_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff0_1_16x8b);   //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
                                                                                //a4*c0+a5*c1 a5*c0+a6*c1 a6*c0+a7*c1 a7*c0+a8*c1
        res_r1_t1_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff0_1_16x8b);   //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1
                                                                                //b4*c0+b5*c1 b5*c0+b6*c1 b6*c0+b7*c1 b7*c0+b8*c1

        src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);                         //a2 a3 a4 a5 a6 a7 a8 a9....a15 0 0
        src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2);                         //b2 b3 b4 b5 b6 b7 b8 b9....b15 0 0

        src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);                 //a3 a4 a5 a6 a7 a8 a9....a15 0  0  0
        src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2);                 //b3 b4 b5 b6 b7 b8 b9....b15 0  0  0

        src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);    //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10
        src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);    //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10

        res_r0_t2_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff2_3_16x8b);   //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
                                                                                //a6*c2+a7*c3 a7*c2+a8*c3 a8*c2+a9*c3 a9*c2+a10*c3
        res_r1_t2_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff2_3_16x8b);   //b2*c2+b3*c3 b3*c2+b4*c3 b4*c2+b5*c3 b5*c2+b6*c3
                                                                                //b6*c2+b7*c3 b7*c2+b8*c3 b8*c2+b9*c3 b9*c2+b10*c3

        src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);                         //a4 a5 a6 a7 a8 a9....a15 0  0  0  0
        src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2);                         //b4 b5 b6 b7 b8 b9....b15 0  0  0  0

        src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);                 //a5 a6 a7 a8 a9....a15 0  0  0  0  0
        src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2);                 //b5 b6 b7 b8 b9....b15 0  0  0  0  0

        src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);    //a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 a10 a11 a11 a12
        src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);    //b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10 b10 b11 b11 b12

        res_r0_t3_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff4_5_16x8b);   //a4*c4+a5*c5 a5*c4+a6*c5  a6*c4+a7*c5   a7*c4+a8*c5
                                                                                //a8*c4+a9*c5 a9*c4+a10*c5 a10*c4+a11*c5 a11*c4+a12*c5
        res_r1_t3_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff4_5_16x8b);   //b4*c4+b5*c5 b5*c4+b6*c5  b6*c4+b7*c5   b7*c4+b8*c5
                                                                                //b8*c4+b9*c5 b9*c4+b10*c5 b10*c4+b11*c5 b11*c4+b12*c5
        res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t2_8x16b);
        res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t2_8x16b);
        res_r0_t3_8x16b = _mm_add_epi16(res_r0_t3_8x16b, const_val16_8x16b);
        res_r1_t3_8x16b = _mm_add_epi16(res_r1_t3_8x16b, const_val16_8x16b);
        res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t3_8x16b);
        res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t3_8x16b);

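        // The 17th output pixel (column 16) is computed in scalar code below,
        // applying the same 1, -5, 20, 20, -5, 1 taps to pu1_src[16..21].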
        tmp = ((pu1_src[18] + pu1_src[19]) << 2) - pu1_src[17] - pu1_src[20];
        tmp = pu1_src[16] + pu1_src[21] + (tmp << 2) + tmp;

        res_r0_t1_8x16b = _mm_srai_epi16(res_r0_t1_8x16b, 5);                   //shifting right by 5 bits.
        res_r1_t1_8x16b = _mm_srai_epi16(res_r1_t1_8x16b, 5);
        tmp = (tmp + 16) >> 5;

        src_r0_16x8b = _mm_packus_epi16(res_r0_t1_8x16b, res_r1_t1_8x16b);
        pu1_dst[16] = CLIP_U8(tmp);

        _mm_storeu_si128((__m128i *)pu1_dst, src_r0_16x8b);

        ht--;
        pu1_src += src_strd;
        pu1_dst += dst_strd;
    }
    while(ht > 0);
}
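
/*
 * For reference, a plain scalar sketch equivalent to the SSSE3 routine above.
 * It is illustrative only: the helper is not part of the library interface
 * and is not called anywhere. Each output pixel is the 6-tap sum with
 * coefficients 1, -5, 20, 20, -5, 1 over x[-2..3], rounded by adding 16 and
 * shifting right by 5, then clipped to 8 bits, matching what the vector path
 * computes for columns 0..15 and the scalar tail computes for column 16.
 */
static void ih264e_sixtapfilter_horz_scalar_sketch(UWORD8 *pu1_src,
                                                   UWORD8 *pu1_dst,
                                                   WORD32 src_strd,
                                                   WORD32 dst_strd)
{
    WORD32 row, col, acc;

    pu1_src -= 2; /* filter input starts from x[-2] */

    for(row = 0; row < 16; row++)
    {
        for(col = 0; col < 17; col++)
        {
            acc = pu1_src[col] - 5 * pu1_src[col + 1]
                + 20 * pu1_src[col + 2] + 20 * pu1_src[col + 3]
                - 5 * pu1_src[col + 4] + pu1_src[col + 5];
            pu1_dst[col] = CLIP_U8((acc + 16) >> 5);
        }
        pu1_src += src_strd;
        pu1_dst += dst_strd;
    }
}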

/*
*******************************************************************************
*
* @brief
*   This function implements a two stage cascaded six tap filter. It
*   applies the six tap filter in the vertical direction on the
*   predictor values, followed by applying the same filter in the
*   horizontal direction on the output of the first stage. The six tap
*   filtering operation is described in sec 8.4.2.2.1 titled "Luma sample
*   interpolation process". (Filter run for width = 17 and height = 17)
*
* @par Description:
*   The function interpolates the predictors first in the vertical direction
*   and then in the horizontal direction to output the (1/2,1/2) sample. The
*   output of the first stage of the filter is stored in the buffer pointed
*   to by pi4_pred1 in 16 bit precision.
*
* @param[in] pu1_src
*  UWORD8 pointer to the source
*
* @param[out] pu1_dst1
*  UWORD8 pointer to the destination (vertical filtered output)
*
* @param[out] pu1_dst2
*  UWORD8 pointer to the destination (output after applying the horizontal
*  filter to the intermediate vertical output)
*
* @param[in] src_strd
*  integer source stride
*
* @param[in] dst_strd
*  integer destination stride of pu1_dst1 and pu1_dst2
*
* @param[in] pi4_pred1
*  Pointer to the 16 bit intermediate buffer
*
* @param[in] pred1_strd
*  integer stride of pi4_pred1
*
* @returns
*  none
*
* @remarks
*  none
*
*******************************************************************************
*/
void ih264e_sixtap_filter_2dvh_vert_ssse3(UWORD8 *pu1_src,
                                          UWORD8 *pu1_dst1,
                                          UWORD8 *pu1_dst2,
                                          WORD32 src_strd,
                                          WORD32 dst_strd,
                                          WORD32 *pi4_pred1,
                                          WORD32 pred1_strd)
{
    WORD32 ht;
    WORD16 *pi2_pred1;

    ht = 17;
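    /*
     * The intermediate buffer is received as WORD32 * but holds the 16-bit
     * vertical filter sums, so it is reinterpreted as WORD16 * and the stride
     * (presumably given in 32-bit units) is doubled to step the same distance
     * in 16-bit units.
     */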
    pi2_pred1 = (WORD16 *)pi4_pred1;
    pred1_strd = pred1_strd << 1;

    // Vertical 6-tap filter
    {
        __m128i src1_r0_16x8b, src1_r1_16x8b, src1_r2_16x8b;
        __m128i src1_r3_16x8b, src1_r4_16x8b, src1_r5_16x8b;
        __m128i src2_r0_16x8b, src2_r1_16x8b, src2_r2_16x8b;
        __m128i src2_r3_16x8b, src2_r4_16x8b, src2_r5_16x8b;

        __m128i src_r0r1_16x8b, src_r2r3_16x8b, src_r4r5_16x8b;

        __m128i res_t1_8x16b, res_t2_8x16b, res_t3_8x16b;
        __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b;

        coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01); //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1
        coeff2_3_16x8b = _mm_set1_epi32(0x14141414); //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3
        coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB); //c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5
                                                     //c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20

        pu1_src -= 2;             // the filter input starts from x[-2] (till x[3])
        pu1_src -= src_strd << 1; // and from two rows above (y[-2])

        // Loading first five rows to start first row processing.
        // 22 values loaded in each row.
        src1_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
        src2_r0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 14));
        pu1_src += src_strd;

        src1_r1_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
        src2_r1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 14));
        pu1_src += src_strd;

        src1_r2_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
        src2_r2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 14));
        pu1_src += src_strd;

        src1_r3_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
        src2_r3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 14));
        pu1_src += src_strd;

        src1_r4_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
        src2_r4_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 14));
        pu1_src += src_strd;

        do
        {
            src1_r5_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
            src2_r5_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 14));

            src_r0r1_16x8b = _mm_unpacklo_epi8(src1_r0_16x8b, src1_r1_16x8b);
            src_r2r3_16x8b = _mm_unpacklo_epi8(src1_r2_16x8b, src1_r3_16x8b);
            src_r4r5_16x8b = _mm_unpacklo_epi8(src1_r4_16x8b, src1_r5_16x8b);

            res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
            res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
            res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);

            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
            res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b);

            _mm_storeu_si128((__m128i *)pi2_pred1, res_t1_8x16b);

            src_r0r1_16x8b = _mm_unpackhi_epi8(src1_r0_16x8b, src1_r1_16x8b);
            src_r2r3_16x8b = _mm_unpackhi_epi8(src1_r2_16x8b, src1_r3_16x8b);
            src_r4r5_16x8b = _mm_unpackhi_epi8(src1_r4_16x8b, src1_r5_16x8b);

            res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
            res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
            res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);

            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
            res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b);

            _mm_storeu_si128((__m128i *)(pi2_pred1 + 8), res_t1_8x16b);

            src_r0r1_16x8b = _mm_unpacklo_epi8(src2_r0_16x8b, src2_r1_16x8b);
            src_r2r3_16x8b = _mm_unpacklo_epi8(src2_r2_16x8b, src2_r3_16x8b);
            src_r4r5_16x8b = _mm_unpacklo_epi8(src2_r4_16x8b, src2_r5_16x8b);

            res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
            res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
            res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);

            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
            res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b);

            _mm_storeu_si128((__m128i *)(pi2_pred1 + 14), res_t1_8x16b);

            src1_r0_16x8b = src1_r1_16x8b;
            src1_r1_16x8b = src1_r2_16x8b;
            src1_r2_16x8b = src1_r3_16x8b;
            src1_r3_16x8b = src1_r4_16x8b;
            src1_r4_16x8b = src1_r5_16x8b;

            src2_r0_16x8b = src2_r1_16x8b;
            src2_r1_16x8b = src2_r2_16x8b;
            src2_r2_16x8b = src2_r3_16x8b;
            src2_r3_16x8b = src2_r4_16x8b;
            src2_r4_16x8b = src2_r5_16x8b;

            ht--;
            pu1_src += src_strd;
            pi2_pred1 += pred1_strd;
        }
        while(ht > 0);
    }

    ht = 17;
    pi2_pred1 = (WORD16 *)pi4_pred1;

    // Horizontal 6-tap filter
    {
        WORD32 temp;

        __m128i src_r0_8x16b, src_r1_8x16b, src_r2_8x16b, src_r3_8x16b;
        __m128i src_r4_8x16b, src_r5_8x16b;
        __m128i src_r0r1_8x16b, src_r2r3_8x16b, src_r4r5_8x16b;
        __m128i res_vert1_8x16b, res_vert2_8x16b, res_16x8b;

        __m128i res_t0_4x32b, res_t1_4x32b, res_t2_4x32b, res_t3_4x32b;
        __m128i res_c0_8x16b, res_c1_8x16b;

        __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b;
        __m128i const_val512_4x32b, const_val16_8x16b;

        coeff0_1_8x16b = _mm_set1_epi32(0xFFFB0001); //c0 c1 c0 c1 c0 c1 c0 c1
        coeff2_3_8x16b = _mm_set1_epi32(0x00140014); //c2 c3 c2 c3 c2 c3 c2 c3
        coeff4_5_8x16b = _mm_set1_epi32(0x0001FFFB); //c4 c5 c4 c5 c4 c5 c4 c5
                                                     //c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20
        const_val512_4x32b = _mm_set1_epi32(512);
        const_val16_8x16b = _mm_set1_epi16(16);

        do
        {
            src_r0_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1));
            src_r1_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 1));
            src_r2_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 2));
            src_r3_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 3));
            src_r4_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 4));
            src_r5_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 5));

            res_vert1_8x16b = _mm_add_epi16(src_r2_8x16b, const_val16_8x16b);
            res_vert1_8x16b = _mm_srai_epi16(res_vert1_8x16b, 5); //shifting right by 5 bits.

            src_r0r1_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b);
            src_r2r3_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b);
            src_r4r5_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b);

            res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
            res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
            res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);

            res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
            res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
            res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
            res_t0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);

            src_r0r1_8x16b = _mm_unpackhi_epi16(src_r0_8x16b, src_r1_8x16b);
            src_r2r3_8x16b = _mm_unpackhi_epi16(src_r2_8x16b, src_r3_8x16b);
            src_r4r5_8x16b = _mm_unpackhi_epi16(src_r4_8x16b, src_r5_8x16b);

            res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
            res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
            res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);

            res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
            res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
            res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
            res_t1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);

            res_c0_8x16b = _mm_packs_epi32(res_t0_4x32b, res_t1_4x32b);

            src_r0_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 8));
            src_r1_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 8 + 1));
            src_r2_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 8 + 2));
            src_r3_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 8 + 3));
            src_r4_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 8 + 4));
            src_r5_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 8 + 5));

            res_vert2_8x16b = _mm_add_epi16(src_r2_8x16b, const_val16_8x16b);
            res_vert2_8x16b = _mm_srai_epi16(res_vert2_8x16b, 5); //shifting right by 5 bits.

            src_r0r1_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b);
            src_r2r3_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b);
            src_r4r5_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b);

            res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
            res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
            res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);

            res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
            res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
            res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
            res_t0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);

            src_r0r1_8x16b = _mm_unpackhi_epi16(src_r0_8x16b, src_r1_8x16b);
            src_r2r3_8x16b = _mm_unpackhi_epi16(src_r2_8x16b, src_r3_8x16b);
            src_r4r5_8x16b = _mm_unpackhi_epi16(src_r4_8x16b, src_r5_8x16b);

            res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
            res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
            res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);

            res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
            res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
            res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
            res_t1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);

            res_c1_8x16b = _mm_packs_epi32(res_t0_4x32b, res_t1_4x32b);

            res_16x8b = _mm_packus_epi16(res_vert1_8x16b, res_vert2_8x16b);
            _mm_storeu_si128((__m128i *)pu1_dst1, res_16x8b);
            pu1_dst1[16] = CLIP_U8((pi2_pred1[18] + 16) >> 5);

            res_16x8b = _mm_packus_epi16(res_c0_8x16b, res_c1_8x16b);
            _mm_storeu_si128((__m128i *)pu1_dst2, res_16x8b);
            temp = ((pi2_pred1[18] + pi2_pred1[19]) << 2) - pi2_pred1[17] - pi2_pred1[20];
            temp = pi2_pred1[16] + pi2_pred1[21] + (temp << 2) + temp;
            pu1_dst2[16] = CLIP_U8((temp + 512) >> 10);

            ht--;
            pi2_pred1 += pred1_strd;
            pu1_dst1 += dst_strd;
            pu1_dst2 += dst_strd;
        }
        while(ht > 0);
    }
}
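
/*
 * For reference, a scalar sketch of the cascaded filter above for a single
 * output position (x, y). It is illustrative only: the helper name is not
 * part of the library and the function is not called anywhere. Stage 1 is
 * the vertical 6-tap filter kept at 16-bit intermediate precision; the
 * vertical half-pel output rounds that intermediate with (+16 >> 5), while
 * the (1/2,1/2) output applies the horizontal 6-tap filter to six such
 * intermediates and rounds with (+512 >> 10).
 */
static void ih264e_sixtap_2dvh_scalar_sketch(UWORD8 *pu1_src,
                                             WORD32 src_strd,
                                             WORD32 x,
                                             WORD32 y,
                                             UWORD8 *pu1_half_v,
                                             UWORD8 *pu1_half_hv)
{
    WORD32 i, sum;
    WORD32 vert[6];
    UWORD8 *pu1_col;

    /* Vertical 6-tap sums at columns (x - 2) .. (x + 3), centred on row y */
    for(i = 0; i < 6; i++)
    {
        pu1_col = pu1_src + (x - 2 + i) + (y - 2) * src_strd;
        vert[i] = pu1_col[0] - 5 * pu1_col[src_strd]
                + 20 * pu1_col[2 * src_strd] + 20 * pu1_col[3 * src_strd]
                - 5 * pu1_col[4 * src_strd] + pu1_col[5 * src_strd];
    }

    /* Vertical half-pel sample: round the intermediate at column x */
    *pu1_half_v = CLIP_U8((vert[2] + 16) >> 5);

    /* Centre (1/2,1/2) sample: horizontal 6-tap on the intermediates */
    sum = vert[0] - 5 * vert[1] + 20 * vert[2] + 20 * vert[3]
        - 5 * vert[4] + vert[5];
    *pu1_half_hv = CLIP_U8((sum + 512) >> 10);
}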
486