xref: /aosp_15_r20/external/libavc/encoder/x86/svc/isvce_residual_pred_sse42.c (revision 495ae853bb871d1e5a258cb02c2cc13cde8ddb9a)
1 /******************************************************************************
2  *
3  * Copyright (C) 2022 The Android Open Source Project
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at:
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  *
17  *****************************************************************************
18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19  */
20 
21 /**
22 *******************************************************************************
23 *
24 * @file
25 *  isvce_residual_pred_sse42.c
26 *
27 * @brief
28 *  Contains functions
29 * used for SVC residual
30 * prediction
31 *
32 *******************************************************************************
33 */
34 #include <immintrin.h>
35 
36 #include "ih264_typedefs.h"
37 #include "ih264_macros.h"
38 #include "isvc_structs.h"
39 
/**
*******************************************************************************
*
* @brief
*  2x (dyadic) upsampler for SVC inter-layer luma residual prediction using
*  SSE4.2 intrinsics. Applies a separable two-tap [3,1]/4 interpolation
*  filter in each direction: an 8x8 reference block is upsampled to 16x16
*  (8x8-transform path), or four 4x4 sub-blocks are each upsampled to 8x8
*  (4x4 path), gated per sub-block by the nnz flags
*
* @param[in] ps_ref_array_positions
*  Reference array positions; unused here (implicit for 2x scaling)
*
* @param[in] ps_ref_array_phases
*  Reference array phases; unused here (implicit for 2x scaling)
*
* @param[in] ps_inp
*  Input buffer container holding reference-layer residual samples (WORD16);
*  the data pointer is advanced past a one-sample top-left border below
*
* @param[out] ps_out
*  Output buffer container receiving the upsampled residual (WORD16)
*
* @param[in] ps_scratch
*  Scratch buffer (WORD16) holding horizontally interpolated intermediates
*  before the vertical pass
*
* @param[in] u4_ref_nnz
*  Non-zero-coefficient flags of the reference 4x4 blocks; in the 4x4 path
*  a sub-block whose flag bit is 0 is skipped and its output left untouched
*
* @param[in] u1_ref_tx_size
*  Non-zero => reference block was coded with the 8x8 transform, so the
*  whole 8x8 block is interpolated in one pass (provided u4_ref_nnz != 0)
*
*******************************************************************************
*/
void isvce_luma_residual_sampler_2x_sse42(coordinates_t *ps_ref_array_positions,
                                          coordinates_t *ps_ref_array_phases,
                                          buffer_container_t *ps_inp, buffer_container_t *ps_out,
                                          buffer_container_t *ps_scratch, UWORD32 u4_ref_nnz,
                                          UWORD8 u1_ref_tx_size)
{
    WORD16 *pi2_inp_data = (WORD16 *) ps_inp->pv_data;
    WORD16 *pi2_out_res = (WORD16 *) ps_out->pv_data;
    WORD32 i4_inp_data_stride = ps_inp->i4_data_stride;
    WORD32 i4_out_res_stride = ps_out->i4_data_stride;
    WORD16 *pi2_refarray_buffer = (WORD16 *) ps_scratch->pv_data;
    WORD32 i4_blk_ctr;

    UNUSED(ps_ref_array_positions);
    UNUSED(ps_ref_array_phases);

    /* For 2x scaling, offsets always point to TL pixel outside MB */
    /* Hence, refTransBlkIdc will be different and since phase */
    /* for first refArray pos for horiz filtering samples > 8, */
    /* first row and first column from the refArray is never used */
    pi2_inp_data += 1 + i4_inp_data_stride;

    if((u1_ref_tx_size) && (0 != u4_ref_nnz))
    {
        /* 8x8-transform path: upsample the whole 8x8 block to 16x16 */
        WORD16 *pi2_ref_data_byte;
        WORD32 i4_i, i4_j;
        WORD16 *pi2_refarray_buffer_tmp = pi2_refarray_buffer;

        __m128i i2_coeff_8x16b_r1_0, i2_coeff_8x16b_r1_1;
        __m128i res_8x16b_r1_0, res_8x16b_r1_1;
        __m128i final_res_8x16b_r1_0, final_res_8x16b_r1_1;
        __m128i coeff_add_8x16b_r1;
        __m128i coeff_add_8x16b_r2;
        __m128i i2_coeff_8x16b_r2_0, i2_coeff_8x16b_r2_1;
        __m128i res_8x16b_r2_0, res_8x16b_r2_1;
        __m128i final_res_8x16b_r2_0, final_res_8x16b_r2_1;

        pi2_ref_data_byte = pi2_inp_data;

        /* ----------- Horizontal Interpolation ---------------- */
        /* Each adjacent input pair (a[i], a[i+1]) yields the two outputs   */
        /* 3*a[i] + a[i+1] and a[i] + 3*a[i+1] ([3,1] taps; normalisation   */
        /* by 1/4 is deferred to the vertical pass). Two rows per iteration.*/
        for(i4_i = 0; i4_i < BLK8x8SIZE; i4_i += 2)
        {
            /* a0 a1 a2 a3 a4 a5 a6 a7 */
            i2_coeff_8x16b_r1_0 = _mm_loadu_si128((__m128i *) pi2_ref_data_byte);
            /* b0 b1 b2 b3 b4 b5 b6 b7 */
            i2_coeff_8x16b_r2_0 =
                _mm_loadu_si128((__m128i *) (pi2_ref_data_byte + i4_inp_data_stride));

            /* a1 a2 a3 a4 a5 a6 a7 0 */
            i2_coeff_8x16b_r1_1 = _mm_srli_si128(i2_coeff_8x16b_r1_0, 2);
            /* b1 b2 b3 b4 b5 b6 b7 0 */
            i2_coeff_8x16b_r2_1 = _mm_srli_si128(i2_coeff_8x16b_r2_0, 2);

            /* a[i] + a[i+1] (shared term of both filter outputs) */
            coeff_add_8x16b_r1 = _mm_add_epi16(i2_coeff_8x16b_r1_0, i2_coeff_8x16b_r1_1);
            coeff_add_8x16b_r2 = _mm_add_epi16(i2_coeff_8x16b_r2_0, i2_coeff_8x16b_r2_1);

            /* 2*a[i] */
            i2_coeff_8x16b_r1_0 = _mm_slli_epi16(i2_coeff_8x16b_r1_0, 1);
            i2_coeff_8x16b_r2_0 = _mm_slli_epi16(i2_coeff_8x16b_r2_0, 1);

            /* 2*a[i+1] */
            i2_coeff_8x16b_r1_1 = _mm_slli_epi16(i2_coeff_8x16b_r1_1, 1);
            i2_coeff_8x16b_r2_1 = _mm_slli_epi16(i2_coeff_8x16b_r2_1, 1);

            /* 3*a[i] + a[i+1] */
            res_8x16b_r1_0 = _mm_add_epi16(i2_coeff_8x16b_r1_0, coeff_add_8x16b_r1);
            res_8x16b_r2_0 = _mm_add_epi16(i2_coeff_8x16b_r2_0, coeff_add_8x16b_r2);

            /* a[i] + 3*a[i+1] */
            res_8x16b_r1_1 = _mm_add_epi16(i2_coeff_8x16b_r1_1, coeff_add_8x16b_r1);
            res_8x16b_r2_1 = _mm_add_epi16(i2_coeff_8x16b_r2_1, coeff_add_8x16b_r2);

            /* Interleave the two phases into output sample order */
            final_res_8x16b_r1_0 = _mm_unpacklo_epi16(res_8x16b_r1_0, res_8x16b_r1_1);
            final_res_8x16b_r2_0 = _mm_unpacklo_epi16(res_8x16b_r2_0, res_8x16b_r2_1);

            final_res_8x16b_r1_1 = _mm_unpackhi_epi16(res_8x16b_r1_0, res_8x16b_r1_1);
            final_res_8x16b_r2_1 = _mm_unpackhi_epi16(res_8x16b_r2_0, res_8x16b_r2_1);

            /* Scratch layout: 16 intermediate samples per row, two rows   */
            /* (32 WORD16) per iteration. Interior samples go to [1..14]   */
            /* and [17..30]; edge samples are patched by the scalar writes */
            /* below (which overwrite the stores' overhang at [15]/[16]).  */
            /* NOTE(review): the store at +25 also writes element [32],    */
            /* one past this row pair — assumes the scratch buffer has     */
            /* headroom / is overwritten next iteration; confirm its size. */
            _mm_storeu_si128((__m128i *) (pi2_refarray_buffer + 1), final_res_8x16b_r1_0);
            _mm_storeu_si128((__m128i *) (pi2_refarray_buffer + 9), final_res_8x16b_r1_1);

            _mm_storeu_si128((__m128i *) (pi2_refarray_buffer + 17), final_res_8x16b_r2_0);
            _mm_storeu_si128((__m128i *) (pi2_refarray_buffer + 25), final_res_8x16b_r2_1);

            /* Left/right edges use border replication: (3+1)*sample = 4x */
            pi2_refarray_buffer[0] = (pi2_ref_data_byte[0] << 2);
            pi2_refarray_buffer[15] = (pi2_ref_data_byte[7] << 2);
            pi2_ref_data_byte += i4_inp_data_stride;
            pi2_refarray_buffer[16] = (pi2_ref_data_byte[0] << 2);
            pi2_refarray_buffer[31] = (pi2_ref_data_byte[7] << 2);

            /* vertical loop updates */
            pi2_ref_data_byte = pi2_inp_data + ((i4_i + 2) * i4_inp_data_stride);
            pi2_refarray_buffer += 32;
        }

        /* ----------- Vertical Interpolation ---------------- */
        /* Same [3,1] taps down the columns; combined with the horizontal  */
        /* pass this gives a /16 normalisation, applied as (x + 8) >> 4.   */
        /* The first and last output rows replicate the border row and are */
        /* normalised as (x + 2) >> 2.                                     */
        pi2_refarray_buffer = pi2_refarray_buffer_tmp;

        {
            __m128i i4_horz_samp_4x32b_r1_1, i4_horz_samp_4x32b_r1_2, i4_horz_samp_4x32b_r1_3,
                i4_horz_samp_4x32b_r1_4;
            __m128i i4_horz_samp_4x32b_r2_1, i4_horz_samp_4x32b_r2_2, i4_horz_samp_4x32b_r2_3,
                i4_horz_samp_4x32b_r2_4;
            __m128i i4_res_samp_4x32b_r1_1, i4_res_samp_4x32b_r1_2, i4_res_samp_4x32b_r1_3,
                i4_res_samp_4x32b_r1_4;
            __m128i i4_res_samp_4x32b_r2_1, i4_res_samp_4x32b_r2_2, i4_res_samp_4x32b_r2_3,
                i4_res_samp_4x32b_r2_4;
            __m128i horz_add_4x32b_r2_1, horz_add_4x32b_r2_2, horz_add_4x32b_r2_3,
                horz_add_4x32b_r2_4;

            __m128i i4_horz_samp_8x16b_r1_1, i4_horz_samp_8x16b_r2_1;
            __m128i i4_horz_samp_8x16b_r1_2, i4_horz_samp_8x16b_r2_2;
            __m128i i4_horz_samp_8x16b_r1_3, i4_horz_samp_8x16b_r2_3;
            __m128i i4_horz_samp_8x16b_r1_4, i4_horz_samp_8x16b_r2_4;

            __m128i twos = _mm_set1_epi32(2);   /* rounding for >> 2 */
            __m128i eights = _mm_set1_epi32(8); /* rounding for >> 4 */

            WORD16 *pi2_out;
            pi2_out = pi2_out_res;

            /* Load the first intermediate row in 4-sample groups; only the */
            /* low 4 WORD16 of each load are used by the epi16->epi32 widen */
            i4_horz_samp_8x16b_r1_1 = _mm_loadu_si128((__m128i *) (pi2_refarray_buffer));
            i4_horz_samp_8x16b_r1_2 = _mm_loadu_si128((__m128i *) (pi2_refarray_buffer + 4));
            i4_horz_samp_8x16b_r1_3 = _mm_loadu_si128((__m128i *) (pi2_refarray_buffer + 8));
            i4_horz_samp_8x16b_r1_4 = _mm_loadu_si128((__m128i *) (pi2_refarray_buffer + 12));

            i4_horz_samp_4x32b_r1_1 = _mm_cvtepi16_epi32(i4_horz_samp_8x16b_r1_1);
            i4_horz_samp_4x32b_r1_2 = _mm_cvtepi16_epi32(i4_horz_samp_8x16b_r1_2);
            i4_horz_samp_4x32b_r1_3 = _mm_cvtepi16_epi32(i4_horz_samp_8x16b_r1_3);
            i4_horz_samp_4x32b_r1_4 = _mm_cvtepi16_epi32(i4_horz_samp_8x16b_r1_4);

            /* populate the first inter sample */
            i4_res_samp_4x32b_r1_1 =
                _mm_srai_epi32(_mm_add_epi32(i4_horz_samp_4x32b_r1_1, twos), 2);
            i4_res_samp_4x32b_r1_2 =
                _mm_srai_epi32(_mm_add_epi32(i4_horz_samp_4x32b_r1_2, twos), 2);
            i4_res_samp_4x32b_r1_3 =
                _mm_srai_epi32(_mm_add_epi32(i4_horz_samp_4x32b_r1_3, twos), 2);
            i4_res_samp_4x32b_r1_4 =
                _mm_srai_epi32(_mm_add_epi32(i4_horz_samp_4x32b_r1_4, twos), 2);

            _mm_storeu_si128((__m128i *) pi2_out,
                             _mm_packs_epi32(i4_res_samp_4x32b_r1_1, i4_res_samp_4x32b_r1_2));
            _mm_storeu_si128((__m128i *) (pi2_out + 8),
                             _mm_packs_epi32(i4_res_samp_4x32b_r1_3, i4_res_samp_4x32b_r1_4));
            pi2_out += i4_out_res_stride;

            /* 7 iterations x 2 output rows = 14 interior rows; with the    */
            /* first and last border rows this totals 16 output rows        */
            for(i4_j = 0; i4_j < 14; i4_j += 2)
            {
                /* intermediate rows are MB_SIZE (16) WORD16 apart */
                pi2_refarray_buffer += MB_SIZE;

                i4_horz_samp_8x16b_r2_1 = _mm_loadu_si128((__m128i *) (pi2_refarray_buffer));
                i4_horz_samp_8x16b_r2_2 = _mm_loadu_si128((__m128i *) (pi2_refarray_buffer + 4));
                i4_horz_samp_8x16b_r2_3 = _mm_loadu_si128((__m128i *) (pi2_refarray_buffer + 8));
                i4_horz_samp_8x16b_r2_4 = _mm_loadu_si128((__m128i *) (pi2_refarray_buffer + 12));

                i4_horz_samp_4x32b_r2_1 = _mm_cvtepi16_epi32(i4_horz_samp_8x16b_r2_1);
                i4_horz_samp_4x32b_r2_2 = _mm_cvtepi16_epi32(i4_horz_samp_8x16b_r2_2);
                i4_horz_samp_4x32b_r2_3 = _mm_cvtepi16_epi32(i4_horz_samp_8x16b_r2_3);
                i4_horz_samp_4x32b_r2_4 = _mm_cvtepi16_epi32(i4_horz_samp_8x16b_r2_4);

                /* h1 + h2 (shared term of both vertical phases) */
                horz_add_4x32b_r2_1 =
                    _mm_add_epi32(i4_horz_samp_4x32b_r1_1, i4_horz_samp_4x32b_r2_1);
                horz_add_4x32b_r2_2 =
                    _mm_add_epi32(i4_horz_samp_4x32b_r1_2, i4_horz_samp_4x32b_r2_2);
                horz_add_4x32b_r2_3 =
                    _mm_add_epi32(i4_horz_samp_4x32b_r1_3, i4_horz_samp_4x32b_r2_3);
                horz_add_4x32b_r2_4 =
                    _mm_add_epi32(i4_horz_samp_4x32b_r1_4, i4_horz_samp_4x32b_r2_4);

                /* 3*h1 + h2 */
                i4_res_samp_4x32b_r1_1 =
                    _mm_add_epi32(_mm_slli_epi32(i4_horz_samp_4x32b_r1_1, 1), horz_add_4x32b_r2_1);
                i4_res_samp_4x32b_r1_2 =
                    _mm_add_epi32(_mm_slli_epi32(i4_horz_samp_4x32b_r1_2, 1), horz_add_4x32b_r2_2);
                i4_res_samp_4x32b_r1_3 =
                    _mm_add_epi32(_mm_slli_epi32(i4_horz_samp_4x32b_r1_3, 1), horz_add_4x32b_r2_3);
                i4_res_samp_4x32b_r1_4 =
                    _mm_add_epi32(_mm_slli_epi32(i4_horz_samp_4x32b_r1_4, 1), horz_add_4x32b_r2_4);

                /* h1 + 3*h2 */
                i4_res_samp_4x32b_r2_1 =
                    _mm_add_epi32(_mm_slli_epi32(i4_horz_samp_4x32b_r2_1, 1), horz_add_4x32b_r2_1);
                i4_res_samp_4x32b_r2_2 =
                    _mm_add_epi32(_mm_slli_epi32(i4_horz_samp_4x32b_r2_2, 1), horz_add_4x32b_r2_2);
                i4_res_samp_4x32b_r2_3 =
                    _mm_add_epi32(_mm_slli_epi32(i4_horz_samp_4x32b_r2_3, 1), horz_add_4x32b_r2_3);
                i4_res_samp_4x32b_r2_4 =
                    _mm_add_epi32(_mm_slli_epi32(i4_horz_samp_4x32b_r2_4, 1), horz_add_4x32b_r2_4);

                /* rounded normalisation: (x + 8) >> 4 */
                i4_res_samp_4x32b_r1_1 =
                    _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r1_1, eights), 4);
                i4_res_samp_4x32b_r1_2 =
                    _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r1_2, eights), 4);
                i4_res_samp_4x32b_r1_3 =
                    _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r1_3, eights), 4);
                i4_res_samp_4x32b_r1_4 =
                    _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r1_4, eights), 4);

                i4_res_samp_4x32b_r2_1 =
                    _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r2_1, eights), 4);
                i4_res_samp_4x32b_r2_2 =
                    _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r2_2, eights), 4);
                i4_res_samp_4x32b_r2_3 =
                    _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r2_3, eights), 4);
                i4_res_samp_4x32b_r2_4 =
                    _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r2_4, eights), 4);

                /* populate 2 samples based on current coeffs */
                _mm_storeu_si128((__m128i *) pi2_out,
                                 _mm_packs_epi32(i4_res_samp_4x32b_r1_1, i4_res_samp_4x32b_r1_2));
                _mm_storeu_si128((__m128i *) (pi2_out + 8),
                                 _mm_packs_epi32(i4_res_samp_4x32b_r1_3, i4_res_samp_4x32b_r1_4));
                pi2_out += i4_out_res_stride;

                _mm_storeu_si128((__m128i *) pi2_out,
                                 _mm_packs_epi32(i4_res_samp_4x32b_r2_1, i4_res_samp_4x32b_r2_2));
                _mm_storeu_si128((__m128i *) (pi2_out + 8),
                                 _mm_packs_epi32(i4_res_samp_4x32b_r2_3, i4_res_samp_4x32b_r2_4));
                pi2_out += i4_out_res_stride;

                /* store the coeff 2 to coeff 1 */
                /* (used in next iteration)     */
                i4_horz_samp_4x32b_r1_1 = i4_horz_samp_4x32b_r2_1;
                i4_horz_samp_4x32b_r1_2 = i4_horz_samp_4x32b_r2_2;
                i4_horz_samp_4x32b_r1_3 = i4_horz_samp_4x32b_r2_3;
                i4_horz_samp_4x32b_r1_4 = i4_horz_samp_4x32b_r2_4;
            }

            /* last output row: border replication, (x + 2) >> 2 */
            i4_res_samp_4x32b_r1_1 =
                _mm_srai_epi32(_mm_add_epi32(i4_horz_samp_4x32b_r1_1, twos), 2);
            i4_res_samp_4x32b_r1_2 =
                _mm_srai_epi32(_mm_add_epi32(i4_horz_samp_4x32b_r1_2, twos), 2);
            i4_res_samp_4x32b_r1_3 =
                _mm_srai_epi32(_mm_add_epi32(i4_horz_samp_4x32b_r1_3, twos), 2);
            i4_res_samp_4x32b_r1_4 =
                _mm_srai_epi32(_mm_add_epi32(i4_horz_samp_4x32b_r1_4, twos), 2);

            _mm_storeu_si128((__m128i *) pi2_out,
                             _mm_packs_epi32(i4_res_samp_4x32b_r1_1, i4_res_samp_4x32b_r1_2));
            _mm_storeu_si128((__m128i *) (pi2_out + 8),
                             _mm_packs_epi32(i4_res_samp_4x32b_r1_3, i4_res_samp_4x32b_r1_4));
        }
    }
    else
    {
        /* 4x4 path: four 4x4 sub-blocks, each upsampled to 8x8 and gated */
        /* by its bit in u4_ref_nnz. Sub-block order: two left-to-right,  */
        /* then the next row of two (see the block-level loop updates).   */
        /* ----------------------------------------------------------------- */
        /* LOOP over number of blocks                                        */
        /* ----------------------------------------------------------------- */
        for(i4_blk_ctr = 0; i4_blk_ctr < 4; i4_blk_ctr++)
        {
            /* if reference layer is not coded then no processing */
            if(0 != (u4_ref_nnz & 0x1))
            {
                {
                    __m128i i2_coeff_8x16b_r1_0, i2_coeff_8x16b_r1_1;
                    __m128i i2_coeff_8x16b_r2_0, i2_coeff_8x16b_r2_1;
                    __m128i i2_coeff_8x16b_r3_0, i2_coeff_8x16b_r3_1;
                    __m128i i2_coeff_8x16b_r4_0, i2_coeff_8x16b_r4_1;

                    __m128i res_8x16b_r1_0, res_8x16b_r1_1;
                    __m128i res_8x16b_r2_0, res_8x16b_r2_1;
                    __m128i res_8x16b_r3_0, res_8x16b_r3_1;
                    __m128i res_8x16b_r4_0, res_8x16b_r4_1;
                    __m128i final_res_8x16b_r1_0;
                    __m128i final_res_8x16b_r2_0;
                    __m128i final_res_8x16b_r3_0;
                    __m128i final_res_8x16b_r4_0;

                    __m128i coeff_add_8x16b_r1;
                    __m128i coeff_add_8x16b_r2;
                    __m128i coeff_add_8x16b_r3;
                    __m128i coeff_add_8x16b_r4;

                    /* ----------- Horizontal Interpolation ---------------- */
                    /* Same [3,1] taps as the 8x8 path; four rows at once.   */
                    /* Only the low 4 input samples of each load are used.   */
                    {
                        /* a0 a1 a2 a3 a4 a5 a6 a7 */
                        i2_coeff_8x16b_r1_0 = _mm_loadu_si128((__m128i *) pi2_inp_data);
                        /* b0 b1 b2 b3 b4 b5 b6 b7 */
                        i2_coeff_8x16b_r2_0 =
                            _mm_loadu_si128((__m128i *) (pi2_inp_data + i4_inp_data_stride));
                        i2_coeff_8x16b_r3_0 =
                            _mm_loadu_si128((__m128i *) (pi2_inp_data + (i4_inp_data_stride << 1)));
                        i2_coeff_8x16b_r4_0 =
                            _mm_loadu_si128((__m128i *) (pi2_inp_data + (i4_inp_data_stride * 3)));

                        /* a1 a2 a3 a4 a5 a6 a7 0 */
                        i2_coeff_8x16b_r1_1 = _mm_srli_si128(i2_coeff_8x16b_r1_0, 2);
                        /* b1 b2 b3 b4 b5 b6 b7 0 */
                        i2_coeff_8x16b_r2_1 = _mm_srli_si128(i2_coeff_8x16b_r2_0, 2);
                        i2_coeff_8x16b_r3_1 = _mm_srli_si128(i2_coeff_8x16b_r3_0, 2);
                        i2_coeff_8x16b_r4_1 = _mm_srli_si128(i2_coeff_8x16b_r4_0, 2);

                        /* a[i] + a[i+1] */
                        coeff_add_8x16b_r1 =
                            _mm_add_epi16(i2_coeff_8x16b_r1_0, i2_coeff_8x16b_r1_1);
                        coeff_add_8x16b_r2 =
                            _mm_add_epi16(i2_coeff_8x16b_r2_0, i2_coeff_8x16b_r2_1);
                        coeff_add_8x16b_r3 =
                            _mm_add_epi16(i2_coeff_8x16b_r3_0, i2_coeff_8x16b_r3_1);
                        coeff_add_8x16b_r4 =
                            _mm_add_epi16(i2_coeff_8x16b_r4_0, i2_coeff_8x16b_r4_1);

                        i2_coeff_8x16b_r1_0 = _mm_slli_epi16(i2_coeff_8x16b_r1_0, 1);
                        i2_coeff_8x16b_r2_0 = _mm_slli_epi16(i2_coeff_8x16b_r2_0, 1);
                        i2_coeff_8x16b_r3_0 = _mm_slli_epi16(i2_coeff_8x16b_r3_0, 1);
                        i2_coeff_8x16b_r4_0 = _mm_slli_epi16(i2_coeff_8x16b_r4_0, 1);

                        i2_coeff_8x16b_r1_1 = _mm_slli_epi16(i2_coeff_8x16b_r1_1, 1);
                        i2_coeff_8x16b_r2_1 = _mm_slli_epi16(i2_coeff_8x16b_r2_1, 1);
                        i2_coeff_8x16b_r3_1 = _mm_slli_epi16(i2_coeff_8x16b_r3_1, 1);
                        i2_coeff_8x16b_r4_1 = _mm_slli_epi16(i2_coeff_8x16b_r4_1, 1);

                        /* 3*a[i] + a[i+1] */
                        res_8x16b_r1_0 = _mm_add_epi16(i2_coeff_8x16b_r1_0, coeff_add_8x16b_r1);
                        res_8x16b_r2_0 = _mm_add_epi16(i2_coeff_8x16b_r2_0, coeff_add_8x16b_r2);
                        res_8x16b_r3_0 = _mm_add_epi16(i2_coeff_8x16b_r3_0, coeff_add_8x16b_r3);
                        res_8x16b_r4_0 = _mm_add_epi16(i2_coeff_8x16b_r4_0, coeff_add_8x16b_r4);

                        /* a[i] + 3*a[i+1] */
                        res_8x16b_r1_1 = _mm_add_epi16(i2_coeff_8x16b_r1_1, coeff_add_8x16b_r1);
                        res_8x16b_r2_1 = _mm_add_epi16(i2_coeff_8x16b_r2_1, coeff_add_8x16b_r2);
                        res_8x16b_r3_1 = _mm_add_epi16(i2_coeff_8x16b_r3_1, coeff_add_8x16b_r3);
                        res_8x16b_r4_1 = _mm_add_epi16(i2_coeff_8x16b_r4_1, coeff_add_8x16b_r4);

                        final_res_8x16b_r1_0 = _mm_unpacklo_epi16(res_8x16b_r1_0, res_8x16b_r1_1);
                        final_res_8x16b_r2_0 = _mm_unpacklo_epi16(res_8x16b_r2_0, res_8x16b_r2_1);
                        final_res_8x16b_r3_0 = _mm_unpacklo_epi16(res_8x16b_r3_0, res_8x16b_r3_1);
                        final_res_8x16b_r4_0 = _mm_unpacklo_epi16(res_8x16b_r4_0, res_8x16b_r4_1);

                        /* Scratch layout here: 8 samples per row (stride    */
                        /* BLK8x8SIZE); interior in [1..6] etc., edges       */
                        /* patched below by scalar writes at [0],[7],...     */
                        _mm_storeu_si128((__m128i *) (pi2_refarray_buffer + 1),
                                         final_res_8x16b_r1_0);
                        _mm_storeu_si128((__m128i *) (pi2_refarray_buffer + 9),
                                         final_res_8x16b_r2_0);
                        _mm_storeu_si128((__m128i *) (pi2_refarray_buffer + 17),
                                         final_res_8x16b_r3_0);
                        _mm_storeu_si128((__m128i *) (pi2_refarray_buffer + 25),
                                         final_res_8x16b_r4_0);

                        /* Left/right edges: border replication => 4*sample */
                        pi2_refarray_buffer[0] = (pi2_inp_data[0] << 2);
                        pi2_refarray_buffer[7] = (pi2_inp_data[3] << 2);
                        pi2_refarray_buffer[8] = (pi2_inp_data[i4_inp_data_stride] << 2);
                        pi2_refarray_buffer[15] = (pi2_inp_data[i4_inp_data_stride + 3] << 2);
                        pi2_refarray_buffer[16] = (pi2_inp_data[(i4_inp_data_stride << 1)] << 2);
                        pi2_refarray_buffer[23] =
                            (pi2_inp_data[(i4_inp_data_stride << 1) + 3] << 2);
                        pi2_refarray_buffer[24] = (pi2_inp_data[(i4_inp_data_stride * 3)] << 2);
                        pi2_refarray_buffer[31] = (pi2_inp_data[(i4_inp_data_stride * 3) + 3] << 2);
                    }

                    /* ----------- Vertical Interpolation ---------------- */
                    /* 4 intermediate rows h0..h3 -> 8 output rows:        */
                    /* r0 = (h0+2)>>2, r1 = (3h0+h1+8)>>4, r2 = (h0+3h1+8)>>4, */
                    /* ... , r7 = (h3+2)>>2 (top/bottom border replicated) */
                    {
                        __m128i i4_horz_samp_8x16b_r0_1, i4_horz_samp_8x16b_r0_2;
                        __m128i i4_horz_samp_8x16b_r1_1, i4_horz_samp_8x16b_r1_2;
                        __m128i i4_horz_samp_8x16b_r2_1, i4_horz_samp_8x16b_r2_2;
                        __m128i i4_horz_samp_8x16b_r3_1, i4_horz_samp_8x16b_r3_2;

                        __m128i i4_horz_samp_4x32b_r0_1, i4_horz_samp_4x32b_r0_2;
                        __m128i i4_horz_samp_4x32b_r1_1, i4_horz_samp_4x32b_r1_2;
                        __m128i i4_horz_samp_4x32b_r2_1, i4_horz_samp_4x32b_r2_2;
                        __m128i i4_horz_samp_4x32b_r3_1, i4_horz_samp_4x32b_r3_2;

                        __m128i i4_res_samp_4x32b_r0_1, i4_res_samp_4x32b_r0_2;
                        __m128i i4_res_samp_4x32b_r1_1, i4_res_samp_4x32b_r1_2;
                        __m128i i4_res_samp_4x32b_r2_1, i4_res_samp_4x32b_r2_2;
                        __m128i i4_res_samp_4x32b_r3_1, i4_res_samp_4x32b_r3_2;
                        __m128i i4_res_samp_4x32b_r4_1, i4_res_samp_4x32b_r4_2;
                        __m128i i4_res_samp_4x32b_r5_1, i4_res_samp_4x32b_r5_2;
                        __m128i i4_res_samp_4x32b_r6_1, i4_res_samp_4x32b_r6_2;
                        __m128i i4_res_samp_4x32b_r7_1, i4_res_samp_4x32b_r7_2;

                        __m128i horz_add_4x32b_r1_1, horz_add_4x32b_r1_2;
                        __m128i horz_add_4x32b_r2_1, horz_add_4x32b_r2_2;
                        __m128i horz_add_4x32b_r3_1, horz_add_4x32b_r3_2;

                        __m128i twos = _mm_set1_epi32(2);   /* rounding for >> 2 */
                        __m128i eights = _mm_set1_epi32(8); /* rounding for >> 4 */

                        i4_horz_samp_8x16b_r0_1 =
                            _mm_loadu_si128((__m128i *) (pi2_refarray_buffer));
                        i4_horz_samp_8x16b_r0_2 =
                            _mm_loadu_si128((__m128i *) (pi2_refarray_buffer + 4));
                        i4_horz_samp_8x16b_r1_1 =
                            _mm_loadu_si128((__m128i *) (pi2_refarray_buffer + BLK8x8SIZE));
                        i4_horz_samp_8x16b_r1_2 =
                            _mm_loadu_si128((__m128i *) (pi2_refarray_buffer + BLK8x8SIZE + 4));
                        i4_horz_samp_8x16b_r2_1 =
                            _mm_loadu_si128((__m128i *) (pi2_refarray_buffer + (BLK8x8SIZE << 1)));
                        i4_horz_samp_8x16b_r2_2 = _mm_loadu_si128(
                            (__m128i *) (pi2_refarray_buffer + (BLK8x8SIZE << 1) + 4));
                        i4_horz_samp_8x16b_r3_1 =
                            _mm_loadu_si128((__m128i *) (pi2_refarray_buffer + (BLK8x8SIZE * 3)));
                        i4_horz_samp_8x16b_r3_2 = _mm_loadu_si128(
                            (__m128i *) (pi2_refarray_buffer + (BLK8x8SIZE * 3) + 4));

                        i4_horz_samp_4x32b_r0_1 = _mm_cvtepi16_epi32(i4_horz_samp_8x16b_r0_1);
                        i4_horz_samp_4x32b_r0_2 = _mm_cvtepi16_epi32(i4_horz_samp_8x16b_r0_2);
                        i4_horz_samp_4x32b_r1_1 = _mm_cvtepi16_epi32(i4_horz_samp_8x16b_r1_1);
                        i4_horz_samp_4x32b_r1_2 = _mm_cvtepi16_epi32(i4_horz_samp_8x16b_r1_2);
                        i4_horz_samp_4x32b_r2_1 = _mm_cvtepi16_epi32(i4_horz_samp_8x16b_r2_1);
                        i4_horz_samp_4x32b_r2_2 = _mm_cvtepi16_epi32(i4_horz_samp_8x16b_r2_2);
                        i4_horz_samp_4x32b_r3_1 = _mm_cvtepi16_epi32(i4_horz_samp_8x16b_r3_1);
                        i4_horz_samp_4x32b_r3_2 = _mm_cvtepi16_epi32(i4_horz_samp_8x16b_r3_2);

                        /* pairwise sums of adjacent intermediate rows */
                        horz_add_4x32b_r1_1 =
                            _mm_add_epi32(i4_horz_samp_4x32b_r0_1, i4_horz_samp_4x32b_r1_1);
                        horz_add_4x32b_r2_1 =
                            _mm_add_epi32(i4_horz_samp_4x32b_r1_1, i4_horz_samp_4x32b_r2_1);
                        horz_add_4x32b_r3_1 =
                            _mm_add_epi32(i4_horz_samp_4x32b_r2_1, i4_horz_samp_4x32b_r3_1);

                        horz_add_4x32b_r1_2 =
                            _mm_add_epi32(i4_horz_samp_4x32b_r0_2, i4_horz_samp_4x32b_r1_2);
                        horz_add_4x32b_r2_2 =
                            _mm_add_epi32(i4_horz_samp_4x32b_r1_2, i4_horz_samp_4x32b_r2_2);
                        horz_add_4x32b_r3_2 =
                            _mm_add_epi32(i4_horz_samp_4x32b_r2_2, i4_horz_samp_4x32b_r3_2);

                        /* interior rows: 3*h[k] + h[k+1] and h[k] + 3*h[k+1] */
                        i4_res_samp_4x32b_r1_1 = _mm_add_epi32(
                            _mm_slli_epi32(i4_horz_samp_4x32b_r0_1, 1), horz_add_4x32b_r1_1);
                        i4_res_samp_4x32b_r2_1 = _mm_add_epi32(
                            _mm_slli_epi32(i4_horz_samp_4x32b_r1_1, 1), horz_add_4x32b_r1_1);
                        i4_res_samp_4x32b_r3_1 = _mm_add_epi32(
                            _mm_slli_epi32(i4_horz_samp_4x32b_r1_1, 1), horz_add_4x32b_r2_1);
                        i4_res_samp_4x32b_r4_1 = _mm_add_epi32(
                            _mm_slli_epi32(i4_horz_samp_4x32b_r2_1, 1), horz_add_4x32b_r2_1);
                        i4_res_samp_4x32b_r5_1 = _mm_add_epi32(
                            _mm_slli_epi32(i4_horz_samp_4x32b_r2_1, 1), horz_add_4x32b_r3_1);
                        i4_res_samp_4x32b_r6_1 = _mm_add_epi32(
                            _mm_slli_epi32(i4_horz_samp_4x32b_r3_1, 1), horz_add_4x32b_r3_1);

                        i4_res_samp_4x32b_r1_2 = _mm_add_epi32(
                            _mm_slli_epi32(i4_horz_samp_4x32b_r0_2, 1), horz_add_4x32b_r1_2);
                        i4_res_samp_4x32b_r2_2 = _mm_add_epi32(
                            _mm_slli_epi32(i4_horz_samp_4x32b_r1_2, 1), horz_add_4x32b_r1_2);
                        i4_res_samp_4x32b_r3_2 = _mm_add_epi32(
                            _mm_slli_epi32(i4_horz_samp_4x32b_r1_2, 1), horz_add_4x32b_r2_2);
                        i4_res_samp_4x32b_r4_2 = _mm_add_epi32(
                            _mm_slli_epi32(i4_horz_samp_4x32b_r2_2, 1), horz_add_4x32b_r2_2);
                        i4_res_samp_4x32b_r5_2 = _mm_add_epi32(
                            _mm_slli_epi32(i4_horz_samp_4x32b_r2_2, 1), horz_add_4x32b_r3_2);
                        i4_res_samp_4x32b_r6_2 = _mm_add_epi32(
                            _mm_slli_epi32(i4_horz_samp_4x32b_r3_2, 1), horz_add_4x32b_r3_2);

                        /* border rows use (x+2)>>2; interior rows (x+8)>>4 */
                        i4_res_samp_4x32b_r0_1 =
                            _mm_srai_epi32(_mm_add_epi32(i4_horz_samp_4x32b_r0_1, twos), 2);
                        i4_res_samp_4x32b_r1_1 =
                            _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r1_1, eights), 4);
                        i4_res_samp_4x32b_r2_1 =
                            _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r2_1, eights), 4);
                        i4_res_samp_4x32b_r3_1 =
                            _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r3_1, eights), 4);
                        i4_res_samp_4x32b_r4_1 =
                            _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r4_1, eights), 4);
                        i4_res_samp_4x32b_r5_1 =
                            _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r5_1, eights), 4);
                        i4_res_samp_4x32b_r6_1 =
                            _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r6_1, eights), 4);
                        i4_res_samp_4x32b_r7_1 =
                            _mm_srai_epi32(_mm_add_epi32(i4_horz_samp_4x32b_r3_1, twos), 2);

                        i4_res_samp_4x32b_r0_2 =
                            _mm_srai_epi32(_mm_add_epi32(i4_horz_samp_4x32b_r0_2, twos), 2);
                        i4_res_samp_4x32b_r1_2 =
                            _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r1_2, eights), 4);
                        i4_res_samp_4x32b_r2_2 =
                            _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r2_2, eights), 4);
                        i4_res_samp_4x32b_r3_2 =
                            _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r3_2, eights), 4);
                        i4_res_samp_4x32b_r4_2 =
                            _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r4_2, eights), 4);
                        i4_res_samp_4x32b_r5_2 =
                            _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r5_2, eights), 4);
                        i4_res_samp_4x32b_r6_2 =
                            _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r6_2, eights), 4);
                        i4_res_samp_4x32b_r7_2 =
                            _mm_srai_epi32(_mm_add_epi32(i4_horz_samp_4x32b_r3_2, twos), 2);

                        /* populate 2 samples based on current coeffs */
                        _mm_storeu_si128(
                            (__m128i *) pi2_out_res,
                            _mm_packs_epi32(i4_res_samp_4x32b_r0_1, i4_res_samp_4x32b_r0_2));
                        _mm_storeu_si128(
                            (__m128i *) (pi2_out_res + i4_out_res_stride),
                            _mm_packs_epi32(i4_res_samp_4x32b_r1_1, i4_res_samp_4x32b_r1_2));
                        _mm_storeu_si128(
                            (__m128i *) (pi2_out_res + (i4_out_res_stride << 1)),
                            _mm_packs_epi32(i4_res_samp_4x32b_r2_1, i4_res_samp_4x32b_r2_2));
                        _mm_storeu_si128(
                            (__m128i *) (pi2_out_res + (i4_out_res_stride * 3)),
                            _mm_packs_epi32(i4_res_samp_4x32b_r3_1, i4_res_samp_4x32b_r3_2));
                        _mm_storeu_si128(
                            (__m128i *) (pi2_out_res + (i4_out_res_stride << 2)),
                            _mm_packs_epi32(i4_res_samp_4x32b_r4_1, i4_res_samp_4x32b_r4_2));
                        _mm_storeu_si128(
                            (__m128i *) (pi2_out_res + (i4_out_res_stride * 5)),
                            _mm_packs_epi32(i4_res_samp_4x32b_r5_1, i4_res_samp_4x32b_r5_2));
                        _mm_storeu_si128(
                            (__m128i *) (pi2_out_res + (i4_out_res_stride * 6)),
                            _mm_packs_epi32(i4_res_samp_4x32b_r6_1, i4_res_samp_4x32b_r6_2));
                        _mm_storeu_si128(
                            (__m128i *) (pi2_out_res + (i4_out_res_stride * 7)),
                            _mm_packs_epi32(i4_res_samp_4x32b_r7_1, i4_res_samp_4x32b_r7_2));

                        pi2_out_res += BLK8x8SIZE;
                    }
                }
            }
            else
            {
                /* nnz bit clear: skip interpolation, just advance output */
                pi2_out_res += BLK8x8SIZE;
            }

            /* Block level loop updates */
            if(1 == i4_blk_ctr)
            {
                /* move from the top sub-block row to the bottom one */
                pi2_inp_data -= 4;
                pi2_inp_data += (i4_inp_data_stride * 4);
                pi2_out_res -= MB_SIZE;
                pi2_out_res += (i4_out_res_stride * BLK8x8SIZE);
                u4_ref_nnz >>= 2;
            }
            else
            {
                pi2_inp_data += 4;
            }

            u4_ref_nnz >>= 1;

        } /* end of loop over all the blocks */
    }
}
563 
UWORD32 isvce_get_sad_with_residual_pred_sse42(buffer_container_t *ps_src,
                                               buffer_container_t *ps_pred,
                                               buffer_container_t *ps_res, UWORD32 u4_mb_wd,
                                               UWORD32 u4_mb_ht)
{
    /* Returns SAD(|src - pred - res|) over a u4_mb_wd x u4_mb_ht block.
     * Fast SSE4.2 path for 16-pixel-wide blocks whose height is a multiple
     * of 8; plain scalar fallback for every other geometry. */
    UWORD32 u4_sad = 0;
    UWORD8 *pu1_src = (UWORD8 *) ps_src->pv_data;
    UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
    WORD16 *pi2_res = (WORD16 *) ps_res->pv_data;
    WORD32 i4_src_stride = ps_src->i4_data_stride;
    WORD32 i4_pred_stride = ps_pred->i4_data_stride;
    WORD32 i4_res_stride = ps_res->i4_data_stride;

    if((16 == u4_mb_wd) && (0 == (u4_mb_ht % 8)))
    {
        UWORD32 u4_blk, u4_pass, u4_row;
        UWORD32 u4_num_blks = u4_mb_ht / 8;
        __m128i diff[8];
        __m128i zero_4x32 = _mm_setzero_si128();

        /* Each outer iteration covers 8 rows: two passes of 4 rows, each row
         * handled as two 8-pixel halves (lo = cols 0..7, hi = cols 8..15). */
        for(u4_blk = 0; u4_blk < u4_num_blks; u4_blk++)
        {
            for(u4_pass = 0; u4_pass < 2; u4_pass++)
            {
                for(u4_row = 0; u4_row < 4; u4_row++)
                {
                    /* Widen 8 src / pred bytes each to 16-bit lanes */
                    __m128i s_lo = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *) pu1_src));
                    __m128i s_hi = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *) (pu1_src + 8)));
                    __m128i p_lo = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *) pu1_pred));
                    __m128i p_hi = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *) (pu1_pred + 8)));
                    /* Residuals are already 16-bit */
                    __m128i r_lo = _mm_loadu_si128((__m128i *) pi2_res);
                    __m128i r_hi = _mm_loadu_si128((__m128i *) (pi2_res + 8));

                    /* |src - pred - res| per 16-bit lane */
                    diff[2 * u4_row] =
                        _mm_abs_epi16(_mm_sub_epi16(_mm_sub_epi16(s_lo, p_lo), r_lo));
                    diff[2 * u4_row + 1] =
                        _mm_abs_epi16(_mm_sub_epi16(_mm_sub_epi16(s_hi, p_hi), r_hi));

                    pu1_src += i4_src_stride;
                    pu1_pred += i4_pred_stride;
                    pi2_res += i4_res_stride;
                }

                /* Saturating tree reduction over the 8 half-row vectors.
                 * Order matches the legacy implementation exactly so the
                 * (theoretical) saturation behavior is unchanged; worst-case
                 * per-lane sum is well below the epu16 ceiling anyway. */
                diff[0] = _mm_adds_epu16(diff[0], diff[1]);
                diff[1] = _mm_adds_epu16(diff[2], diff[3]);
                diff[2] = _mm_adds_epu16(diff[4], diff[5]);
                diff[3] = _mm_adds_epu16(diff[6], diff[7]);

                diff[0] = _mm_adds_epu16(diff[0], diff[1]);
                diff[1] = _mm_adds_epu16(diff[2], diff[3]);

                diff[0] = _mm_adds_epu16(diff[0], diff[1]);

                /* Widen the 8 lane totals to 32 bit and fold horizontally */
                {
                    __m128i sum_lo = _mm_cvtepu16_epi32(diff[0]);
                    __m128i sum_hi = _mm_cvtepu16_epi32(_mm_srli_si128(diff[0], 8));
                    __m128i total = _mm_hadd_epi32(sum_lo, sum_hi);

                    total = _mm_hadd_epi32(total, zero_4x32);
                    total = _mm_hadd_epi32(total, zero_4x32);

                    u4_sad += _mm_extract_epi32(total, 0);
                }
            }
        }
    }
    else
    {
        /* Generic geometry: straightforward scalar accumulation */
        UWORD32 u4_r, u4_c;

        for(u4_r = 0; u4_r < u4_mb_ht; u4_r++)
        {
            for(u4_c = 0; u4_c < u4_mb_wd; u4_c++)
            {
                WORD32 i4_diff = ((WORD32) pu1_src[u4_c + u4_r * i4_src_stride]) -
                                 ((WORD32) pu1_pred[u4_c + u4_r * i4_pred_stride]) -
                                 ((WORD32) pi2_res[u4_c + u4_r * i4_res_stride]);

                u4_sad += ABS(i4_diff);
            }
        }
    }

    return u4_sad;
}
736