/******************************************************************************
 *
 * Copyright (C) 2022 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 */

/**
 *******************************************************************************
 *
 * @file
 *  isvce_residual_pred_sse42.c
 *
 * @brief
 *  Contains SSE4.2 intrinsics implementations of functions used for SVC
 *  residual prediction
 *
 *******************************************************************************
 */
#include <immintrin.h>

#include "ih264_typedefs.h"
#include "ih264_macros.h"
#include "isvc_structs.h"

void isvce_luma_residual_sampler_2x_sse42(coordinates_t *ps_ref_array_positions,
                                          coordinates_t *ps_ref_array_phases,
                                          buffer_container_t *ps_inp, buffer_container_t *ps_out,
                                          buffer_container_t *ps_scratch, UWORD32 u4_ref_nnz,
                                          UWORD8 u1_ref_tx_size)
{
    WORD16 *pi2_inp_data = (WORD16 *) ps_inp->pv_data;
    WORD16 *pi2_out_res = (WORD16 *) ps_out->pv_data;
    WORD32 i4_inp_data_stride = ps_inp->i4_data_stride;
    WORD32 i4_out_res_stride = ps_out->i4_data_stride;
    WORD16 *pi2_refarray_buffer = (WORD16 *) ps_scratch->pv_data;
    WORD32 i4_blk_ctr;

    UNUSED(ps_ref_array_positions);
    UNUSED(ps_ref_array_phases);

    /* For 2x scaling, the offsets always point to the TL pixel outside the */
    /* MB. Hence refTransBlkIdc differs and, since the phase of the first   */
    /* refArray position for horizontal filtering of samples is > 8, the    */
    /* first row and first column of the refArray are never used.           */
    pi2_inp_data += 1 + i4_inp_data_stride;

    if((u1_ref_tx_size) && (0 != u4_ref_nnz))
    {
        WORD16 *pi2_ref_data_byte;
        WORD32 i4_i, i4_j;
        WORD16 *pi2_refarray_buffer_tmp = pi2_refarray_buffer;

        __m128i i2_coeff_8x16b_r1_0, i2_coeff_8x16b_r1_1;
        __m128i res_8x16b_r1_0, res_8x16b_r1_1;
        __m128i final_res_8x16b_r1_0, final_res_8x16b_r1_1;
        __m128i coeff_add_8x16b_r1;
        __m128i coeff_add_8x16b_r2;
        __m128i i2_coeff_8x16b_r2_0, i2_coeff_8x16b_r2_1;
        __m128i res_8x16b_r2_0, res_8x16b_r2_1;
        __m128i final_res_8x16b_r2_0, final_res_8x16b_r2_1;

        pi2_ref_data_byte = pi2_inp_data;

        /* ----------- Horizontal Interpolation ---------------- */
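        /* Each input pair (a[i], a[i+1]) produces out[2i] = 3*a[i] + a[i+1] */
        /* and out[2i+1] = a[i] + 3*a[i+1], i.e. the two phases of the       */
        /* bilinear 2x upsampling filter kept at 4x scale; normalization is  */
        /* deferred to the vertical stage.                                   */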
        for(i4_i = 0; i4_i < BLK8x8SIZE; i4_i += 2)
        {
            /* a0 a1 a2 a3 a4 a5 a6 a7 */
            i2_coeff_8x16b_r1_0 = _mm_loadu_si128((__m128i *) pi2_ref_data_byte);
            /* b0 b1 b2 b3 b4 b5 b6 b7 */
            i2_coeff_8x16b_r2_0 =
                _mm_loadu_si128((__m128i *) (pi2_ref_data_byte + i4_inp_data_stride));

            /* a1 a2 a3 a4 a5 a6 a7 0 */
            i2_coeff_8x16b_r1_1 = _mm_srli_si128(i2_coeff_8x16b_r1_0, 2);
            /* b1 b2 b3 b4 b5 b6 b7 0 */
            i2_coeff_8x16b_r2_1 = _mm_srli_si128(i2_coeff_8x16b_r2_0, 2);

            coeff_add_8x16b_r1 = _mm_add_epi16(i2_coeff_8x16b_r1_0, i2_coeff_8x16b_r1_1);
            coeff_add_8x16b_r2 = _mm_add_epi16(i2_coeff_8x16b_r2_0, i2_coeff_8x16b_r2_1);

            i2_coeff_8x16b_r1_0 = _mm_slli_epi16(i2_coeff_8x16b_r1_0, 1);
            i2_coeff_8x16b_r2_0 = _mm_slli_epi16(i2_coeff_8x16b_r2_0, 1);

            i2_coeff_8x16b_r1_1 = _mm_slli_epi16(i2_coeff_8x16b_r1_1, 1);
            i2_coeff_8x16b_r2_1 = _mm_slli_epi16(i2_coeff_8x16b_r2_1, 1);

            res_8x16b_r1_0 = _mm_add_epi16(i2_coeff_8x16b_r1_0, coeff_add_8x16b_r1);
            res_8x16b_r2_0 = _mm_add_epi16(i2_coeff_8x16b_r2_0, coeff_add_8x16b_r2);

            res_8x16b_r1_1 = _mm_add_epi16(i2_coeff_8x16b_r1_1, coeff_add_8x16b_r1);
            res_8x16b_r2_1 = _mm_add_epi16(i2_coeff_8x16b_r2_1, coeff_add_8x16b_r2);

            final_res_8x16b_r1_0 = _mm_unpacklo_epi16(res_8x16b_r1_0, res_8x16b_r1_1);
            final_res_8x16b_r2_0 = _mm_unpacklo_epi16(res_8x16b_r2_0, res_8x16b_r2_1);

            final_res_8x16b_r1_1 = _mm_unpackhi_epi16(res_8x16b_r1_0, res_8x16b_r1_1);
            final_res_8x16b_r2_1 = _mm_unpackhi_epi16(res_8x16b_r2_0, res_8x16b_r2_1);

            _mm_storeu_si128((__m128i *) (pi2_refarray_buffer + 1), final_res_8x16b_r1_0);
            _mm_storeu_si128((__m128i *) (pi2_refarray_buffer + 9), final_res_8x16b_r1_1);

            _mm_storeu_si128((__m128i *) (pi2_refarray_buffer + 17), final_res_8x16b_r2_0);
            _mm_storeu_si128((__m128i *) (pi2_refarray_buffer + 25), final_res_8x16b_r2_1);

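            /* Border columns have no outer neighbour to filter with; they  */
            /* are replicated, scaled by 4 (<< 2) to match the filter gain. */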
            pi2_refarray_buffer[0] = (pi2_ref_data_byte[0] << 2);
            pi2_refarray_buffer[15] = (pi2_ref_data_byte[7] << 2);
            pi2_ref_data_byte += i4_inp_data_stride;
            pi2_refarray_buffer[16] = (pi2_ref_data_byte[0] << 2);
            pi2_refarray_buffer[31] = (pi2_ref_data_byte[7] << 2);

            /* vertical loop updates */
            pi2_ref_data_byte = pi2_inp_data + ((i4_i + 2) * i4_inp_data_stride);
            pi2_refarray_buffer += 32;
        }

        /* ----------- Vertical Interpolation ---------------- */
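        /* The first and last output rows depend on a single horizontal row */
        /* and are normalized as (x + 2) >> 2; each interior pair of output */
        /* rows is (3*r1 + r2 + 8) >> 4 and (r1 + 3*r2 + 8) >> 4, which     */
        /* folds in the 4x gain left over from the horizontal stage.        */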
        pi2_refarray_buffer = pi2_refarray_buffer_tmp;

        {
            __m128i i4_horz_samp_4x32b_r1_1, i4_horz_samp_4x32b_r1_2, i4_horz_samp_4x32b_r1_3,
                i4_horz_samp_4x32b_r1_4;
            __m128i i4_horz_samp_4x32b_r2_1, i4_horz_samp_4x32b_r2_2, i4_horz_samp_4x32b_r2_3,
                i4_horz_samp_4x32b_r2_4;
            __m128i i4_res_samp_4x32b_r1_1, i4_res_samp_4x32b_r1_2, i4_res_samp_4x32b_r1_3,
                i4_res_samp_4x32b_r1_4;
            __m128i i4_res_samp_4x32b_r2_1, i4_res_samp_4x32b_r2_2, i4_res_samp_4x32b_r2_3,
                i4_res_samp_4x32b_r2_4;
            __m128i horz_add_4x32b_r2_1, horz_add_4x32b_r2_2, horz_add_4x32b_r2_3,
                horz_add_4x32b_r2_4;

            __m128i i4_horz_samp_8x16b_r1_1, i4_horz_samp_8x16b_r2_1;
            __m128i i4_horz_samp_8x16b_r1_2, i4_horz_samp_8x16b_r2_2;
            __m128i i4_horz_samp_8x16b_r1_3, i4_horz_samp_8x16b_r2_3;
            __m128i i4_horz_samp_8x16b_r1_4, i4_horz_samp_8x16b_r2_4;

            __m128i twos = _mm_set1_epi32(2);
            __m128i eights = _mm_set1_epi32(8);

            WORD16 *pi2_out;
            pi2_out = pi2_out_res;

            i4_horz_samp_8x16b_r1_1 = _mm_loadu_si128((__m128i *) (pi2_refarray_buffer));
            i4_horz_samp_8x16b_r1_2 = _mm_loadu_si128((__m128i *) (pi2_refarray_buffer + 4));
            i4_horz_samp_8x16b_r1_3 = _mm_loadu_si128((__m128i *) (pi2_refarray_buffer + 8));
            i4_horz_samp_8x16b_r1_4 = _mm_loadu_si128((__m128i *) (pi2_refarray_buffer + 12));

            i4_horz_samp_4x32b_r1_1 = _mm_cvtepi16_epi32(i4_horz_samp_8x16b_r1_1);
            i4_horz_samp_4x32b_r1_2 = _mm_cvtepi16_epi32(i4_horz_samp_8x16b_r1_2);
            i4_horz_samp_4x32b_r1_3 = _mm_cvtepi16_epi32(i4_horz_samp_8x16b_r1_3);
            i4_horz_samp_4x32b_r1_4 = _mm_cvtepi16_epi32(i4_horz_samp_8x16b_r1_4);

            /* populate the first interpolated output row */
            i4_res_samp_4x32b_r1_1 =
                _mm_srai_epi32(_mm_add_epi32(i4_horz_samp_4x32b_r1_1, twos), 2);
            i4_res_samp_4x32b_r1_2 =
                _mm_srai_epi32(_mm_add_epi32(i4_horz_samp_4x32b_r1_2, twos), 2);
            i4_res_samp_4x32b_r1_3 =
                _mm_srai_epi32(_mm_add_epi32(i4_horz_samp_4x32b_r1_3, twos), 2);
            i4_res_samp_4x32b_r1_4 =
                _mm_srai_epi32(_mm_add_epi32(i4_horz_samp_4x32b_r1_4, twos), 2);

            _mm_storeu_si128((__m128i *) pi2_out,
                             _mm_packs_epi32(i4_res_samp_4x32b_r1_1, i4_res_samp_4x32b_r1_2));
            _mm_storeu_si128((__m128i *) (pi2_out + 8),
                             _mm_packs_epi32(i4_res_samp_4x32b_r1_3, i4_res_samp_4x32b_r1_4));
            pi2_out += i4_out_res_stride;

            for(i4_j = 0; i4_j < 14; i4_j += 2)
            {
                pi2_refarray_buffer += MB_SIZE;

                i4_horz_samp_8x16b_r2_1 = _mm_loadu_si128((__m128i *) (pi2_refarray_buffer));
                i4_horz_samp_8x16b_r2_2 = _mm_loadu_si128((__m128i *) (pi2_refarray_buffer + 4));
                i4_horz_samp_8x16b_r2_3 = _mm_loadu_si128((__m128i *) (pi2_refarray_buffer + 8));
                i4_horz_samp_8x16b_r2_4 = _mm_loadu_si128((__m128i *) (pi2_refarray_buffer + 12));

                i4_horz_samp_4x32b_r2_1 = _mm_cvtepi16_epi32(i4_horz_samp_8x16b_r2_1);
                i4_horz_samp_4x32b_r2_2 = _mm_cvtepi16_epi32(i4_horz_samp_8x16b_r2_2);
                i4_horz_samp_4x32b_r2_3 = _mm_cvtepi16_epi32(i4_horz_samp_8x16b_r2_3);
                i4_horz_samp_4x32b_r2_4 = _mm_cvtepi16_epi32(i4_horz_samp_8x16b_r2_4);

                horz_add_4x32b_r2_1 =
                    _mm_add_epi32(i4_horz_samp_4x32b_r1_1, i4_horz_samp_4x32b_r2_1);
                horz_add_4x32b_r2_2 =
                    _mm_add_epi32(i4_horz_samp_4x32b_r1_2, i4_horz_samp_4x32b_r2_2);
                horz_add_4x32b_r2_3 =
                    _mm_add_epi32(i4_horz_samp_4x32b_r1_3, i4_horz_samp_4x32b_r2_3);
                horz_add_4x32b_r2_4 =
                    _mm_add_epi32(i4_horz_samp_4x32b_r1_4, i4_horz_samp_4x32b_r2_4);

                i4_res_samp_4x32b_r1_1 =
                    _mm_add_epi32(_mm_slli_epi32(i4_horz_samp_4x32b_r1_1, 1), horz_add_4x32b_r2_1);
                i4_res_samp_4x32b_r1_2 =
                    _mm_add_epi32(_mm_slli_epi32(i4_horz_samp_4x32b_r1_2, 1), horz_add_4x32b_r2_2);
                i4_res_samp_4x32b_r1_3 =
                    _mm_add_epi32(_mm_slli_epi32(i4_horz_samp_4x32b_r1_3, 1), horz_add_4x32b_r2_3);
                i4_res_samp_4x32b_r1_4 =
                    _mm_add_epi32(_mm_slli_epi32(i4_horz_samp_4x32b_r1_4, 1), horz_add_4x32b_r2_4);

                i4_res_samp_4x32b_r2_1 =
                    _mm_add_epi32(_mm_slli_epi32(i4_horz_samp_4x32b_r2_1, 1), horz_add_4x32b_r2_1);
                i4_res_samp_4x32b_r2_2 =
                    _mm_add_epi32(_mm_slli_epi32(i4_horz_samp_4x32b_r2_2, 1), horz_add_4x32b_r2_2);
                i4_res_samp_4x32b_r2_3 =
                    _mm_add_epi32(_mm_slli_epi32(i4_horz_samp_4x32b_r2_3, 1), horz_add_4x32b_r2_3);
                i4_res_samp_4x32b_r2_4 =
                    _mm_add_epi32(_mm_slli_epi32(i4_horz_samp_4x32b_r2_4, 1), horz_add_4x32b_r2_4);

                i4_res_samp_4x32b_r1_1 =
                    _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r1_1, eights), 4);
                i4_res_samp_4x32b_r1_2 =
                    _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r1_2, eights), 4);
                i4_res_samp_4x32b_r1_3 =
                    _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r1_3, eights), 4);
                i4_res_samp_4x32b_r1_4 =
                    _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r1_4, eights), 4);

                i4_res_samp_4x32b_r2_1 =
                    _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r2_1, eights), 4);
                i4_res_samp_4x32b_r2_2 =
                    _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r2_2, eights), 4);
                i4_res_samp_4x32b_r2_3 =
                    _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r2_3, eights), 4);
                i4_res_samp_4x32b_r2_4 =
                    _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r2_4, eights), 4);

                /* populate 2 output rows from the current coeffs */
                _mm_storeu_si128((__m128i *) pi2_out,
                                 _mm_packs_epi32(i4_res_samp_4x32b_r1_1, i4_res_samp_4x32b_r1_2));
                _mm_storeu_si128((__m128i *) (pi2_out + 8),
                                 _mm_packs_epi32(i4_res_samp_4x32b_r1_3, i4_res_samp_4x32b_r1_4));
                pi2_out += i4_out_res_stride;

                _mm_storeu_si128((__m128i *) pi2_out,
                                 _mm_packs_epi32(i4_res_samp_4x32b_r2_1, i4_res_samp_4x32b_r2_2));
                _mm_storeu_si128((__m128i *) (pi2_out + 8),
                                 _mm_packs_epi32(i4_res_samp_4x32b_r2_3, i4_res_samp_4x32b_r2_4));
                pi2_out += i4_out_res_stride;

                /* carry the row-2 samples over to row 1 */
                /* (used in the next iteration)          */
                i4_horz_samp_4x32b_r1_1 = i4_horz_samp_4x32b_r2_1;
                i4_horz_samp_4x32b_r1_2 = i4_horz_samp_4x32b_r2_2;
                i4_horz_samp_4x32b_r1_3 = i4_horz_samp_4x32b_r2_3;
                i4_horz_samp_4x32b_r1_4 = i4_horz_samp_4x32b_r2_4;
            }

            i4_res_samp_4x32b_r1_1 =
                _mm_srai_epi32(_mm_add_epi32(i4_horz_samp_4x32b_r1_1, twos), 2);
            i4_res_samp_4x32b_r1_2 =
                _mm_srai_epi32(_mm_add_epi32(i4_horz_samp_4x32b_r1_2, twos), 2);
            i4_res_samp_4x32b_r1_3 =
                _mm_srai_epi32(_mm_add_epi32(i4_horz_samp_4x32b_r1_3, twos), 2);
            i4_res_samp_4x32b_r1_4 =
                _mm_srai_epi32(_mm_add_epi32(i4_horz_samp_4x32b_r1_4, twos), 2);

            _mm_storeu_si128((__m128i *) pi2_out,
                             _mm_packs_epi32(i4_res_samp_4x32b_r1_1, i4_res_samp_4x32b_r1_2));
            _mm_storeu_si128((__m128i *) (pi2_out + 8),
                             _mm_packs_epi32(i4_res_samp_4x32b_r1_3, i4_res_samp_4x32b_r1_4));
        }
    }
    else
    {
        /* ----------------------------------------------------------------- */
        /* LOOP over number of blocks                                         */
        /* ----------------------------------------------------------------- */
        for(i4_blk_ctr = 0; i4_blk_ctr < 4; i4_blk_ctr++)
        {
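            /* u4_ref_nnz carries one nnz bit per reference-layer 4x4 block; */
            /* it is consumed LSB first as the four blocks are visited.      */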
            /* if the reference-layer 4x4 block is not coded, skip it */
            if(0 != (u4_ref_nnz & 0x1))
            {
                {
                    __m128i i2_coeff_8x16b_r1_0, i2_coeff_8x16b_r1_1;
                    __m128i i2_coeff_8x16b_r2_0, i2_coeff_8x16b_r2_1;
                    __m128i i2_coeff_8x16b_r3_0, i2_coeff_8x16b_r3_1;
                    __m128i i2_coeff_8x16b_r4_0, i2_coeff_8x16b_r4_1;

                    __m128i res_8x16b_r1_0, res_8x16b_r1_1;
                    __m128i res_8x16b_r2_0, res_8x16b_r2_1;
                    __m128i res_8x16b_r3_0, res_8x16b_r3_1;
                    __m128i res_8x16b_r4_0, res_8x16b_r4_1;
                    __m128i final_res_8x16b_r1_0;
                    __m128i final_res_8x16b_r2_0;
                    __m128i final_res_8x16b_r3_0;
                    __m128i final_res_8x16b_r4_0;

                    __m128i coeff_add_8x16b_r1;
                    __m128i coeff_add_8x16b_r2;
                    __m128i coeff_add_8x16b_r3;
                    __m128i coeff_add_8x16b_r4;

                    /* ----------- Horizontal Interpolation ---------------- */
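                    /* Same [3 1] / [1 3] two-phase filter as the 8x8-transform */
                    /* path above, applied to the four rows of the 4x4 block.   */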
                    {
                        /* a0 a1 a2 a3 a4 a5 a6 a7 */
                        i2_coeff_8x16b_r1_0 = _mm_loadu_si128((__m128i *) pi2_inp_data);
                        /* b0 b1 b2 b3 b4 b5 b6 b7 */
                        i2_coeff_8x16b_r2_0 =
                            _mm_loadu_si128((__m128i *) (pi2_inp_data + i4_inp_data_stride));
                        i2_coeff_8x16b_r3_0 =
                            _mm_loadu_si128((__m128i *) (pi2_inp_data + (i4_inp_data_stride << 1)));
                        i2_coeff_8x16b_r4_0 =
                            _mm_loadu_si128((__m128i *) (pi2_inp_data + (i4_inp_data_stride * 3)));

                        /* a1 a2 a3 a4 a5 a6 a7 0 */
                        i2_coeff_8x16b_r1_1 = _mm_srli_si128(i2_coeff_8x16b_r1_0, 2);
                        /* b1 b2 b3 b4 b5 b6 b7 0 */
                        i2_coeff_8x16b_r2_1 = _mm_srli_si128(i2_coeff_8x16b_r2_0, 2);
                        i2_coeff_8x16b_r3_1 = _mm_srli_si128(i2_coeff_8x16b_r3_0, 2);
                        i2_coeff_8x16b_r4_1 = _mm_srli_si128(i2_coeff_8x16b_r4_0, 2);

                        coeff_add_8x16b_r1 =
                            _mm_add_epi16(i2_coeff_8x16b_r1_0, i2_coeff_8x16b_r1_1);
                        coeff_add_8x16b_r2 =
                            _mm_add_epi16(i2_coeff_8x16b_r2_0, i2_coeff_8x16b_r2_1);
                        coeff_add_8x16b_r3 =
                            _mm_add_epi16(i2_coeff_8x16b_r3_0, i2_coeff_8x16b_r3_1);
                        coeff_add_8x16b_r4 =
                            _mm_add_epi16(i2_coeff_8x16b_r4_0, i2_coeff_8x16b_r4_1);

                        i2_coeff_8x16b_r1_0 = _mm_slli_epi16(i2_coeff_8x16b_r1_0, 1);
                        i2_coeff_8x16b_r2_0 = _mm_slli_epi16(i2_coeff_8x16b_r2_0, 1);
                        i2_coeff_8x16b_r3_0 = _mm_slli_epi16(i2_coeff_8x16b_r3_0, 1);
                        i2_coeff_8x16b_r4_0 = _mm_slli_epi16(i2_coeff_8x16b_r4_0, 1);

                        i2_coeff_8x16b_r1_1 = _mm_slli_epi16(i2_coeff_8x16b_r1_1, 1);
                        i2_coeff_8x16b_r2_1 = _mm_slli_epi16(i2_coeff_8x16b_r2_1, 1);
                        i2_coeff_8x16b_r3_1 = _mm_slli_epi16(i2_coeff_8x16b_r3_1, 1);
                        i2_coeff_8x16b_r4_1 = _mm_slli_epi16(i2_coeff_8x16b_r4_1, 1);

                        res_8x16b_r1_0 = _mm_add_epi16(i2_coeff_8x16b_r1_0, coeff_add_8x16b_r1);
                        res_8x16b_r2_0 = _mm_add_epi16(i2_coeff_8x16b_r2_0, coeff_add_8x16b_r2);
                        res_8x16b_r3_0 = _mm_add_epi16(i2_coeff_8x16b_r3_0, coeff_add_8x16b_r3);
                        res_8x16b_r4_0 = _mm_add_epi16(i2_coeff_8x16b_r4_0, coeff_add_8x16b_r4);

                        res_8x16b_r1_1 = _mm_add_epi16(i2_coeff_8x16b_r1_1, coeff_add_8x16b_r1);
                        res_8x16b_r2_1 = _mm_add_epi16(i2_coeff_8x16b_r2_1, coeff_add_8x16b_r2);
                        res_8x16b_r3_1 = _mm_add_epi16(i2_coeff_8x16b_r3_1, coeff_add_8x16b_r3);
                        res_8x16b_r4_1 = _mm_add_epi16(i2_coeff_8x16b_r4_1, coeff_add_8x16b_r4);

                        final_res_8x16b_r1_0 = _mm_unpacklo_epi16(res_8x16b_r1_0, res_8x16b_r1_1);
                        final_res_8x16b_r2_0 = _mm_unpacklo_epi16(res_8x16b_r2_0, res_8x16b_r2_1);
                        final_res_8x16b_r3_0 = _mm_unpacklo_epi16(res_8x16b_r3_0, res_8x16b_r3_1);
                        final_res_8x16b_r4_0 = _mm_unpacklo_epi16(res_8x16b_r4_0, res_8x16b_r4_1);

                        _mm_storeu_si128((__m128i *) (pi2_refarray_buffer + 1),
                                         final_res_8x16b_r1_0);
                        _mm_storeu_si128((__m128i *) (pi2_refarray_buffer + 9),
                                         final_res_8x16b_r2_0);
                        _mm_storeu_si128((__m128i *) (pi2_refarray_buffer + 17),
                                         final_res_8x16b_r3_0);
                        _mm_storeu_si128((__m128i *) (pi2_refarray_buffer + 25),
                                         final_res_8x16b_r4_0);

                        pi2_refarray_buffer[0] = (pi2_inp_data[0] << 2);
                        pi2_refarray_buffer[7] = (pi2_inp_data[3] << 2);
                        pi2_refarray_buffer[8] = (pi2_inp_data[i4_inp_data_stride] << 2);
                        pi2_refarray_buffer[15] = (pi2_inp_data[i4_inp_data_stride + 3] << 2);
                        pi2_refarray_buffer[16] = (pi2_inp_data[(i4_inp_data_stride << 1)] << 2);
                        pi2_refarray_buffer[23] =
                            (pi2_inp_data[(i4_inp_data_stride << 1) + 3] << 2);
                        pi2_refarray_buffer[24] = (pi2_inp_data[(i4_inp_data_stride * 3)] << 2);
                        pi2_refarray_buffer[31] = (pi2_inp_data[(i4_inp_data_stride * 3) + 3] << 2);
                    }

                    /* ----------- Vertical Interpolation ---------------- */
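                    /* 4 -> 8 row upsampling, fully unrolled: rows 0 and 7 take */
                    /* a single source row via (x + 2) >> 2; interior rows use  */
                    /* (3*rA + rB + 8) >> 4, matching the 8x8-transform path.   */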
                    {
                        __m128i i4_horz_samp_8x16b_r0_1, i4_horz_samp_8x16b_r0_2;
                        __m128i i4_horz_samp_8x16b_r1_1, i4_horz_samp_8x16b_r1_2;
                        __m128i i4_horz_samp_8x16b_r2_1, i4_horz_samp_8x16b_r2_2;
                        __m128i i4_horz_samp_8x16b_r3_1, i4_horz_samp_8x16b_r3_2;

                        __m128i i4_horz_samp_4x32b_r0_1, i4_horz_samp_4x32b_r0_2;
                        __m128i i4_horz_samp_4x32b_r1_1, i4_horz_samp_4x32b_r1_2;
                        __m128i i4_horz_samp_4x32b_r2_1, i4_horz_samp_4x32b_r2_2;
                        __m128i i4_horz_samp_4x32b_r3_1, i4_horz_samp_4x32b_r3_2;

                        __m128i i4_res_samp_4x32b_r0_1, i4_res_samp_4x32b_r0_2;
                        __m128i i4_res_samp_4x32b_r1_1, i4_res_samp_4x32b_r1_2;
                        __m128i i4_res_samp_4x32b_r2_1, i4_res_samp_4x32b_r2_2;
                        __m128i i4_res_samp_4x32b_r3_1, i4_res_samp_4x32b_r3_2;
                        __m128i i4_res_samp_4x32b_r4_1, i4_res_samp_4x32b_r4_2;
                        __m128i i4_res_samp_4x32b_r5_1, i4_res_samp_4x32b_r5_2;
                        __m128i i4_res_samp_4x32b_r6_1, i4_res_samp_4x32b_r6_2;
                        __m128i i4_res_samp_4x32b_r7_1, i4_res_samp_4x32b_r7_2;

                        __m128i horz_add_4x32b_r1_1, horz_add_4x32b_r1_2;
                        __m128i horz_add_4x32b_r2_1, horz_add_4x32b_r2_2;
                        __m128i horz_add_4x32b_r3_1, horz_add_4x32b_r3_2;

                        __m128i twos = _mm_set1_epi32(2);
                        __m128i eights = _mm_set1_epi32(8);

                        i4_horz_samp_8x16b_r0_1 =
                            _mm_loadu_si128((__m128i *) (pi2_refarray_buffer));
                        i4_horz_samp_8x16b_r0_2 =
                            _mm_loadu_si128((__m128i *) (pi2_refarray_buffer + 4));
                        i4_horz_samp_8x16b_r1_1 =
                            _mm_loadu_si128((__m128i *) (pi2_refarray_buffer + BLK8x8SIZE));
                        i4_horz_samp_8x16b_r1_2 =
                            _mm_loadu_si128((__m128i *) (pi2_refarray_buffer + BLK8x8SIZE + 4));
                        i4_horz_samp_8x16b_r2_1 =
                            _mm_loadu_si128((__m128i *) (pi2_refarray_buffer + (BLK8x8SIZE << 1)));
                        i4_horz_samp_8x16b_r2_2 = _mm_loadu_si128(
                            (__m128i *) (pi2_refarray_buffer + (BLK8x8SIZE << 1) + 4));
                        i4_horz_samp_8x16b_r3_1 =
                            _mm_loadu_si128((__m128i *) (pi2_refarray_buffer + (BLK8x8SIZE * 3)));
                        i4_horz_samp_8x16b_r3_2 = _mm_loadu_si128(
                            (__m128i *) (pi2_refarray_buffer + (BLK8x8SIZE * 3) + 4));

                        i4_horz_samp_4x32b_r0_1 = _mm_cvtepi16_epi32(i4_horz_samp_8x16b_r0_1);
                        i4_horz_samp_4x32b_r0_2 = _mm_cvtepi16_epi32(i4_horz_samp_8x16b_r0_2);
                        i4_horz_samp_4x32b_r1_1 = _mm_cvtepi16_epi32(i4_horz_samp_8x16b_r1_1);
                        i4_horz_samp_4x32b_r1_2 = _mm_cvtepi16_epi32(i4_horz_samp_8x16b_r1_2);
                        i4_horz_samp_4x32b_r2_1 = _mm_cvtepi16_epi32(i4_horz_samp_8x16b_r2_1);
                        i4_horz_samp_4x32b_r2_2 = _mm_cvtepi16_epi32(i4_horz_samp_8x16b_r2_2);
                        i4_horz_samp_4x32b_r3_1 = _mm_cvtepi16_epi32(i4_horz_samp_8x16b_r3_1);
                        i4_horz_samp_4x32b_r3_2 = _mm_cvtepi16_epi32(i4_horz_samp_8x16b_r3_2);

                        horz_add_4x32b_r1_1 =
                            _mm_add_epi32(i4_horz_samp_4x32b_r0_1, i4_horz_samp_4x32b_r1_1);
                        horz_add_4x32b_r2_1 =
                            _mm_add_epi32(i4_horz_samp_4x32b_r1_1, i4_horz_samp_4x32b_r2_1);
                        horz_add_4x32b_r3_1 =
                            _mm_add_epi32(i4_horz_samp_4x32b_r2_1, i4_horz_samp_4x32b_r3_1);

                        horz_add_4x32b_r1_2 =
                            _mm_add_epi32(i4_horz_samp_4x32b_r0_2, i4_horz_samp_4x32b_r1_2);
                        horz_add_4x32b_r2_2 =
                            _mm_add_epi32(i4_horz_samp_4x32b_r1_2, i4_horz_samp_4x32b_r2_2);
                        horz_add_4x32b_r3_2 =
                            _mm_add_epi32(i4_horz_samp_4x32b_r2_2, i4_horz_samp_4x32b_r3_2);

                        i4_res_samp_4x32b_r1_1 = _mm_add_epi32(
                            _mm_slli_epi32(i4_horz_samp_4x32b_r0_1, 1), horz_add_4x32b_r1_1);
                        i4_res_samp_4x32b_r2_1 = _mm_add_epi32(
                            _mm_slli_epi32(i4_horz_samp_4x32b_r1_1, 1), horz_add_4x32b_r1_1);
                        i4_res_samp_4x32b_r3_1 = _mm_add_epi32(
                            _mm_slli_epi32(i4_horz_samp_4x32b_r1_1, 1), horz_add_4x32b_r2_1);
                        i4_res_samp_4x32b_r4_1 = _mm_add_epi32(
                            _mm_slli_epi32(i4_horz_samp_4x32b_r2_1, 1), horz_add_4x32b_r2_1);
                        i4_res_samp_4x32b_r5_1 = _mm_add_epi32(
                            _mm_slli_epi32(i4_horz_samp_4x32b_r2_1, 1), horz_add_4x32b_r3_1);
                        i4_res_samp_4x32b_r6_1 = _mm_add_epi32(
                            _mm_slli_epi32(i4_horz_samp_4x32b_r3_1, 1), horz_add_4x32b_r3_1);

                        i4_res_samp_4x32b_r1_2 = _mm_add_epi32(
                            _mm_slli_epi32(i4_horz_samp_4x32b_r0_2, 1), horz_add_4x32b_r1_2);
                        i4_res_samp_4x32b_r2_2 = _mm_add_epi32(
                            _mm_slli_epi32(i4_horz_samp_4x32b_r1_2, 1), horz_add_4x32b_r1_2);
                        i4_res_samp_4x32b_r3_2 = _mm_add_epi32(
                            _mm_slli_epi32(i4_horz_samp_4x32b_r1_2, 1), horz_add_4x32b_r2_2);
                        i4_res_samp_4x32b_r4_2 = _mm_add_epi32(
                            _mm_slli_epi32(i4_horz_samp_4x32b_r2_2, 1), horz_add_4x32b_r2_2);
                        i4_res_samp_4x32b_r5_2 = _mm_add_epi32(
                            _mm_slli_epi32(i4_horz_samp_4x32b_r2_2, 1), horz_add_4x32b_r3_2);
                        i4_res_samp_4x32b_r6_2 = _mm_add_epi32(
                            _mm_slli_epi32(i4_horz_samp_4x32b_r3_2, 1), horz_add_4x32b_r3_2);

                        i4_res_samp_4x32b_r0_1 =
                            _mm_srai_epi32(_mm_add_epi32(i4_horz_samp_4x32b_r0_1, twos), 2);
                        i4_res_samp_4x32b_r1_1 =
                            _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r1_1, eights), 4);
                        i4_res_samp_4x32b_r2_1 =
                            _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r2_1, eights), 4);
                        i4_res_samp_4x32b_r3_1 =
                            _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r3_1, eights), 4);
                        i4_res_samp_4x32b_r4_1 =
                            _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r4_1, eights), 4);
                        i4_res_samp_4x32b_r5_1 =
                            _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r5_1, eights), 4);
                        i4_res_samp_4x32b_r6_1 =
                            _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r6_1, eights), 4);
                        i4_res_samp_4x32b_r7_1 =
                            _mm_srai_epi32(_mm_add_epi32(i4_horz_samp_4x32b_r3_1, twos), 2);

                        i4_res_samp_4x32b_r0_2 =
                            _mm_srai_epi32(_mm_add_epi32(i4_horz_samp_4x32b_r0_2, twos), 2);
                        i4_res_samp_4x32b_r1_2 =
                            _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r1_2, eights), 4);
                        i4_res_samp_4x32b_r2_2 =
                            _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r2_2, eights), 4);
                        i4_res_samp_4x32b_r3_2 =
                            _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r3_2, eights), 4);
                        i4_res_samp_4x32b_r4_2 =
                            _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r4_2, eights), 4);
                        i4_res_samp_4x32b_r5_2 =
                            _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r5_2, eights), 4);
                        i4_res_samp_4x32b_r6_2 =
                            _mm_srai_epi32(_mm_add_epi32(i4_res_samp_4x32b_r6_2, eights), 4);
                        i4_res_samp_4x32b_r7_2 =
                            _mm_srai_epi32(_mm_add_epi32(i4_horz_samp_4x32b_r3_2, twos), 2);

                        /* store the 8 interpolated output rows */
                        _mm_storeu_si128(
                            (__m128i *) pi2_out_res,
                            _mm_packs_epi32(i4_res_samp_4x32b_r0_1, i4_res_samp_4x32b_r0_2));
                        _mm_storeu_si128(
                            (__m128i *) (pi2_out_res + i4_out_res_stride),
                            _mm_packs_epi32(i4_res_samp_4x32b_r1_1, i4_res_samp_4x32b_r1_2));
                        _mm_storeu_si128(
                            (__m128i *) (pi2_out_res + (i4_out_res_stride << 1)),
                            _mm_packs_epi32(i4_res_samp_4x32b_r2_1, i4_res_samp_4x32b_r2_2));
                        _mm_storeu_si128(
                            (__m128i *) (pi2_out_res + (i4_out_res_stride * 3)),
                            _mm_packs_epi32(i4_res_samp_4x32b_r3_1, i4_res_samp_4x32b_r3_2));
                        _mm_storeu_si128(
                            (__m128i *) (pi2_out_res + (i4_out_res_stride << 2)),
                            _mm_packs_epi32(i4_res_samp_4x32b_r4_1, i4_res_samp_4x32b_r4_2));
                        _mm_storeu_si128(
                            (__m128i *) (pi2_out_res + (i4_out_res_stride * 5)),
                            _mm_packs_epi32(i4_res_samp_4x32b_r5_1, i4_res_samp_4x32b_r5_2));
                        _mm_storeu_si128(
                            (__m128i *) (pi2_out_res + (i4_out_res_stride * 6)),
                            _mm_packs_epi32(i4_res_samp_4x32b_r6_1, i4_res_samp_4x32b_r6_2));
                        _mm_storeu_si128(
                            (__m128i *) (pi2_out_res + (i4_out_res_stride * 7)),
                            _mm_packs_epi32(i4_res_samp_4x32b_r7_1, i4_res_samp_4x32b_r7_2));

                        pi2_out_res += BLK8x8SIZE;
                    }
                }
            }
            else
            {
                pi2_out_res += BLK8x8SIZE;
            }

            /* Block level loop updates */
            if(1 == i4_blk_ctr)
            {
                pi2_inp_data -= 4;
                pi2_inp_data += (i4_inp_data_stride * 4);
                pi2_out_res -= MB_SIZE;
                pi2_out_res += (i4_out_res_stride * BLK8x8SIZE);
                u4_ref_nnz >>= 2;
            }
            else
            {
                pi2_inp_data += 4;
            }

            u4_ref_nnz >>= 1;

        } /* end of loop over all the blocks */
    }
}

UWORD32 isvce_get_sad_with_residual_pred_sse42(buffer_container_t *ps_src,
                                               buffer_container_t *ps_pred,
                                               buffer_container_t *ps_res, UWORD32 u4_mb_wd,
                                               UWORD32 u4_mb_ht)
{
    UWORD32 i, j, u4_sad = 0;
    UWORD8 *pu1_src = (UWORD8 *) ps_src->pv_data;
    UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
    WORD16 *pi2_res = (WORD16 *) ps_res->pv_data;
    WORD32 i4_src_stride = ps_src->i4_data_stride;
    WORD32 i4_pred_stride = ps_pred->i4_data_stride;
    WORD32 i4_res_stride = ps_res->i4_data_stride;
    UWORD32 u4_num_rows_per_loop = 8;
    UWORD32 u4_ht_by_8 = u4_mb_ht / u4_num_rows_per_loop;

    __m128i src_r0, src_r1, src_r2, src_r3;
    __m128i src_r4, src_r5, src_r6, src_r7;
    __m128i pred_r0, pred_r1, pred_r2, pred_r3;
    __m128i pred_r4, pred_r5, pred_r6, pred_r7;
    __m128i res_r0, res_r1, res_r2, res_r3;
    __m128i res_r4, res_r5, res_r6, res_r7;
    __m128i zero_4x32 = _mm_set1_epi32((WORD32) 0);

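    /* SAD with residual prediction: sum of |src - (pred + res)| over the */
    /* block. The SIMD path widens to 16 bit and accumulates partial sums */
    /* with saturating adds so they cannot wrap around.                   */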
    if((u4_mb_wd == 16) && (u4_mb_ht % 8 == 0))
    {
        for(i = 0; i < u4_ht_by_8; i++)
        {
            for(j = 0; j < 2; j++)
            {
                src_r0 = _mm_loadl_epi64((__m128i *) (pu1_src));
                src_r1 = _mm_loadl_epi64((__m128i *) (pu1_src + 8));

                pu1_src += i4_src_stride;

                src_r2 = _mm_loadl_epi64((__m128i *) (pu1_src));
                src_r3 = _mm_loadl_epi64((__m128i *) (pu1_src + 8));

                pu1_src += i4_src_stride;

                src_r4 = _mm_loadl_epi64((__m128i *) (pu1_src));
                src_r5 = _mm_loadl_epi64((__m128i *) (pu1_src + 8));

                pu1_src += i4_src_stride;

                src_r6 = _mm_loadl_epi64((__m128i *) (pu1_src));
                src_r7 = _mm_loadl_epi64((__m128i *) (pu1_src + 8));

                pu1_src += i4_src_stride;

                pred_r0 = _mm_loadl_epi64((__m128i *) (pu1_pred));
                pred_r1 = _mm_loadl_epi64((__m128i *) (pu1_pred + 8));

                pu1_pred += i4_pred_stride;

                pred_r2 = _mm_loadl_epi64((__m128i *) (pu1_pred));
                pred_r3 = _mm_loadl_epi64((__m128i *) (pu1_pred + 8));

                pu1_pred += i4_pred_stride;

                pred_r4 = _mm_loadl_epi64((__m128i *) (pu1_pred));
                pred_r5 = _mm_loadl_epi64((__m128i *) (pu1_pred + 8));

                pu1_pred += i4_pred_stride;

                pred_r6 = _mm_loadl_epi64((__m128i *) (pu1_pred));
                pred_r7 = _mm_loadl_epi64((__m128i *) (pu1_pred + 8));

                pu1_pred += i4_pred_stride;

                src_r0 = _mm_cvtepu8_epi16(src_r0);
                src_r1 = _mm_cvtepu8_epi16(src_r1);
                src_r2 = _mm_cvtepu8_epi16(src_r2);
                src_r3 = _mm_cvtepu8_epi16(src_r3);
                src_r4 = _mm_cvtepu8_epi16(src_r4);
                src_r5 = _mm_cvtepu8_epi16(src_r5);
                src_r6 = _mm_cvtepu8_epi16(src_r6);
                src_r7 = _mm_cvtepu8_epi16(src_r7);

                pred_r0 = _mm_cvtepu8_epi16(pred_r0);
                pred_r1 = _mm_cvtepu8_epi16(pred_r1);
                pred_r2 = _mm_cvtepu8_epi16(pred_r2);
                pred_r3 = _mm_cvtepu8_epi16(pred_r3);
                pred_r4 = _mm_cvtepu8_epi16(pred_r4);
                pred_r5 = _mm_cvtepu8_epi16(pred_r5);
                pred_r6 = _mm_cvtepu8_epi16(pred_r6);
                pred_r7 = _mm_cvtepu8_epi16(pred_r7);

                res_r0 = _mm_loadu_si128((__m128i *) (pi2_res));
                res_r1 = _mm_loadu_si128((__m128i *) (pi2_res + 8));

                pi2_res += i4_res_stride;

                res_r2 = _mm_loadu_si128((__m128i *) (pi2_res));
                res_r3 = _mm_loadu_si128((__m128i *) (pi2_res + 8));

                pi2_res += i4_res_stride;

                res_r4 = _mm_loadu_si128((__m128i *) (pi2_res));
                res_r5 = _mm_loadu_si128((__m128i *) (pi2_res + 8));

                pi2_res += i4_res_stride;

                res_r6 = _mm_loadu_si128((__m128i *) (pi2_res));
                res_r7 = _mm_loadu_si128((__m128i *) (pi2_res + 8));

                pi2_res += i4_res_stride;

                src_r0 = _mm_sub_epi16(src_r0, pred_r0);
                src_r1 = _mm_sub_epi16(src_r1, pred_r1);
                src_r2 = _mm_sub_epi16(src_r2, pred_r2);
                src_r3 = _mm_sub_epi16(src_r3, pred_r3);
                src_r4 = _mm_sub_epi16(src_r4, pred_r4);
                src_r5 = _mm_sub_epi16(src_r5, pred_r5);
                src_r6 = _mm_sub_epi16(src_r6, pred_r6);
                src_r7 = _mm_sub_epi16(src_r7, pred_r7);

                src_r0 = _mm_sub_epi16(src_r0, res_r0);
                src_r1 = _mm_sub_epi16(src_r1, res_r1);
                src_r2 = _mm_sub_epi16(src_r2, res_r2);
                src_r3 = _mm_sub_epi16(src_r3, res_r3);
                src_r4 = _mm_sub_epi16(src_r4, res_r4);
                src_r5 = _mm_sub_epi16(src_r5, res_r5);
                src_r6 = _mm_sub_epi16(src_r6, res_r6);
                src_r7 = _mm_sub_epi16(src_r7, res_r7);

                src_r0 = _mm_abs_epi16(src_r0);
                src_r1 = _mm_abs_epi16(src_r1);
                src_r2 = _mm_abs_epi16(src_r2);
                src_r3 = _mm_abs_epi16(src_r3);
                src_r4 = _mm_abs_epi16(src_r4);
                src_r5 = _mm_abs_epi16(src_r5);
                src_r6 = _mm_abs_epi16(src_r6);
                src_r7 = _mm_abs_epi16(src_r7);

                src_r0 = _mm_adds_epu16(src_r0, src_r1);
                src_r1 = _mm_adds_epu16(src_r2, src_r3);
                src_r2 = _mm_adds_epu16(src_r4, src_r5);
                src_r3 = _mm_adds_epu16(src_r6, src_r7);

                src_r0 = _mm_adds_epu16(src_r0, src_r1);
                src_r1 = _mm_adds_epu16(src_r2, src_r3);

                src_r0 = _mm_adds_epu16(src_r0, src_r1);

                src_r1 = _mm_cvtepu16_epi32(src_r0);
                src_r2 = _mm_srli_si128(src_r0, 8);
                src_r2 = _mm_cvtepu16_epi32(src_r2);

                src_r0 = _mm_hadd_epi32(src_r1, src_r2);
                src_r0 = _mm_hadd_epi32(src_r0, zero_4x32);
                src_r0 = _mm_hadd_epi32(src_r0, zero_4x32);

                u4_sad += _mm_extract_epi32(src_r0, 0);
            }
        }
    }
    else
    {
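        /* Scalar fallback for widths other than 16 or heights that are not */
        /* a multiple of 8; computes the same sum of |src - pred - res|.    */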
        for(i = 0; i < u4_mb_ht; i++)
        {
            for(j = 0; j < u4_mb_wd; j++)
            {
                WORD16 i2_src = pu1_src[j + i * i4_src_stride];
                WORD16 i2_pred = pu1_pred[j + i * i4_pred_stride];
                WORD16 i2_res = pi2_res[j + i * i4_res_stride];
                u4_sad += ABS(i2_src - i2_pred - i2_res);
            }
        }
    }

    return u4_sad;
}
