1 /******************************************************************************
2 *
3 * Copyright (C) 2022 The Android Open Source Project
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at:
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 *****************************************************************************
18 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19 */
20 /**
21 *******************************************************************************
22 * @file
23 * isvc_mem_fns_atom_intr.c
24 *
25 * @brief
26 * Functions used for memory operations
27 *
28 * @author
29 * Ittiam
30 *
31 * @par List of Functions:
32 *
33 * @remarks
34 * None
35 *
36 *******************************************************************************
37 */
38
39 /*****************************************************************************/
40 /* File Includes */
41 /*****************************************************************************/
42 #include <stdio.h>
43 #include <stddef.h>
44 #include <stdlib.h>
45 #include <string.h>
46 #include <assert.h>
47
48 #include "ih264_typedefs.h"
49 #include "isvc_mem_fns.h"
50
51 #include <immintrin.h>
52
53 /**
54 ********************************************************************************
55 * @brief copies a 2d blk from one location to another
56 *
57 * @param[out] pu1_dst : dst pointer
58 *
59 * @param[in] i4_dst_stride: stride of destination
60 *
61 * @param[in] pu1_src : src ptr
62 *
63 * @param[in] i4_src_stride: stride of src
64 *
65 * @param[in] i4_blk_wd : blk width
66 *
67 * @param[in] i4_blk_ht : blk height
68 *
69 * @return void
70 ********************************************************************************
71 */
isvc_copy_2d_ssse3(UWORD8 * pu1_dst,WORD32 i4_dst_stride,UWORD8 * pu1_src,WORD32 i4_src_stride,WORD32 i4_blk_wd,WORD32 i4_blk_ht)72 void isvc_copy_2d_ssse3(UWORD8 *pu1_dst, WORD32 i4_dst_stride, UWORD8 *pu1_src,
73 WORD32 i4_src_stride, WORD32 i4_blk_wd, WORD32 i4_blk_ht)
74 {
75 WORD32 i, j;
76 /* all 128 bit registers are named with a suffix mxnb, where m is the */
77 /* number of n bits packed in the register */
78
79 if(((i4_blk_wd % 4) != 0) || ((i4_blk_ht % 4) != 0))
80 {
81 isvc_copy_2d(pu1_dst, i4_dst_stride, pu1_src, i4_src_stride, i4_blk_wd, i4_blk_ht);
82
83 return;
84 }
85
86 if(0 == (i4_blk_wd & 31)) /* wd multiple of 32 case */
87 {
88 __m128i src0_16x8b, src1_16x8b, src2_16x8b, src3_16x8b;
89 __m128i src4_16x8b, src5_16x8b, src6_16x8b, src7_16x8b;
90
91 if(0 == (i4_blk_ht & 7)) /* ht multiple of 8 case */
92 {
93 __m128i src8_16x8b, src9_16x8b, src10_16x8b, src11_16x8b;
94 __m128i src12_16x8b, src13_16x8b, src14_16x8b, src15_16x8b;
95
96 for(i = 0; i < i4_blk_ht; i += 8)
97 {
98 for(j = 0; j < i4_blk_wd; j += 32)
99 {
100 src0_16x8b =
101 _mm_loadu_si128((__m128i *) (pu1_src)); // i = 0
102 src1_16x8b =
103 _mm_loadu_si128((__m128i *) (pu1_src + i4_src_stride)); // i = 1
104 src2_16x8b =
105 _mm_loadu_si128((__m128i *) (pu1_src + 2 * i4_src_stride)); // i = 2
106 src3_16x8b =
107 _mm_loadu_si128((__m128i *) (pu1_src + 3 * i4_src_stride)); // i = 3
108 src4_16x8b =
109 _mm_loadu_si128((__m128i *) (pu1_src + 4 * i4_src_stride)); // i = 4
110 src5_16x8b =
111 _mm_loadu_si128((__m128i *) (pu1_src + 5 * i4_src_stride)); // i = 5
112 src6_16x8b =
113 _mm_loadu_si128((__m128i *) (pu1_src + 6 * i4_src_stride)); // i = 6
114 src7_16x8b =
115 _mm_loadu_si128((__m128i *) (pu1_src + 7 * i4_src_stride)); // i = 7
116 /* Add 16 as offset */
117 src8_16x8b =
118 _mm_loadu_si128((__m128i *) (pu1_src + 16)); // i = 0
119 src9_16x8b =
120 _mm_loadu_si128((__m128i *) (pu1_src + i4_src_stride + 16)); // i = 1
121 src10_16x8b =
122 _mm_loadu_si128((__m128i *) (pu1_src + 2 * i4_src_stride + 16)); // i = 2
123 src11_16x8b =
124 _mm_loadu_si128((__m128i *) (pu1_src + 3 * i4_src_stride + 16)); // i = 3
125 src12_16x8b =
126 _mm_loadu_si128((__m128i *) (pu1_src + 4 * i4_src_stride + 16)); // i = 4
127 src13_16x8b =
128 _mm_loadu_si128((__m128i *) (pu1_src + 5 * i4_src_stride + 16)); // i = 5
129 src14_16x8b =
130 _mm_loadu_si128((__m128i *) (pu1_src + 6 * i4_src_stride + 16)); // i = 6
131 src15_16x8b =
132 _mm_loadu_si128((__m128i *) (pu1_src + 7 * i4_src_stride + 16)); // i = 7
133
134 _mm_storeu_si128((__m128i *) (pu1_dst), src0_16x8b);
135 _mm_storeu_si128((__m128i *) (pu1_dst + i4_dst_stride), src1_16x8b);
136 _mm_storeu_si128((__m128i *) (pu1_dst + 2 * i4_dst_stride), src2_16x8b);
137 _mm_storeu_si128((__m128i *) (pu1_dst + 3 * i4_dst_stride), src3_16x8b);
138 _mm_storeu_si128((__m128i *) (pu1_dst + 4 * i4_dst_stride), src4_16x8b);
139 _mm_storeu_si128((__m128i *) (pu1_dst + 5 * i4_dst_stride), src5_16x8b);
140 _mm_storeu_si128((__m128i *) (pu1_dst + 6 * i4_dst_stride), src6_16x8b);
141 _mm_storeu_si128((__m128i *) (pu1_dst + 7 * i4_dst_stride), src7_16x8b);
142
143 _mm_storeu_si128((__m128i *) (pu1_dst + 16), src8_16x8b);
144 _mm_storeu_si128((__m128i *) (pu1_dst + i4_dst_stride + 16), src9_16x8b);
145 _mm_storeu_si128((__m128i *) (pu1_dst + 2 * i4_dst_stride + 16), src10_16x8b);
146 _mm_storeu_si128((__m128i *) (pu1_dst + 3 * i4_dst_stride + 16), src11_16x8b);
147 _mm_storeu_si128((__m128i *) (pu1_dst + 4 * i4_dst_stride + 16), src12_16x8b);
148 _mm_storeu_si128((__m128i *) (pu1_dst + 5 * i4_dst_stride + 16), src13_16x8b);
149 _mm_storeu_si128((__m128i *) (pu1_dst + 6 * i4_dst_stride + 16), src14_16x8b);
150 _mm_storeu_si128((__m128i *) (pu1_dst + 7 * i4_dst_stride + 16), src15_16x8b);
151
152 pu1_src += 32;
153 pu1_dst += 32;
154 }
155
156 pu1_src = pu1_src - i4_blk_wd + 8 * i4_src_stride;
157 pu1_dst = pu1_dst - i4_blk_wd + 8 * i4_dst_stride;
158 }
159 }
160 else /* ht multiple of 4 case */
161 {
162 for(i = 0; i < i4_blk_ht; i += 4)
163 {
164 for(j = 0; j < i4_blk_wd; j += 32)
165 {
166 src0_16x8b =
167 _mm_loadu_si128((__m128i *) (pu1_src)); // i = 0
168 src1_16x8b =
169 _mm_loadu_si128((__m128i *) (pu1_src + i4_src_stride)); // i = 1
170 src2_16x8b =
171 _mm_loadu_si128((__m128i *) (pu1_src + 2 * i4_src_stride)); // i = 2
172 src3_16x8b =
173 _mm_loadu_si128((__m128i *) (pu1_src + 3 * i4_src_stride)); // i = 3
174 /* Add 16 as offset */
175 src4_16x8b =
176 _mm_loadu_si128((__m128i *) (pu1_src + 16)); // i = 0
177 src5_16x8b =
178 _mm_loadu_si128((__m128i *) (pu1_src + i4_src_stride + 16)); // i = 1
179 src6_16x8b =
180 _mm_loadu_si128((__m128i *) (pu1_src + 2 * i4_src_stride + 16)); // i = 2
181 src7_16x8b =
182 _mm_loadu_si128((__m128i *) (pu1_src + 3 * i4_src_stride + 16)); // i = 3
183
184 _mm_storeu_si128((__m128i *) (pu1_dst), src0_16x8b);
185 _mm_storeu_si128((__m128i *) (pu1_dst + i4_dst_stride), src1_16x8b);
186 _mm_storeu_si128((__m128i *) (pu1_dst + 2 * i4_dst_stride), src2_16x8b);
187 _mm_storeu_si128((__m128i *) (pu1_dst + 3 * i4_dst_stride), src3_16x8b);
188 _mm_storeu_si128((__m128i *) (pu1_dst + 16), src4_16x8b);
189 _mm_storeu_si128((__m128i *) (pu1_dst + i4_dst_stride + 16), src5_16x8b);
190 _mm_storeu_si128((__m128i *) (pu1_dst + 2 * i4_dst_stride + 16), src6_16x8b);
191 _mm_storeu_si128((__m128i *) (pu1_dst + 3 * i4_dst_stride + 16), src7_16x8b);
192
193 pu1_src += 32;
194 pu1_dst += 32;
195 }
196
197 pu1_src = pu1_src - i4_blk_wd + 4 * i4_src_stride;
198 pu1_dst = pu1_dst - i4_blk_wd + 4 * i4_dst_stride;
199 }
200 }
201 }
202 else if(0 == (i4_blk_wd & 15)) /* wd multiple of 16 case */
203 {
204 __m128i src0_16x8b, src1_16x8b, src2_16x8b, src3_16x8b;
205
206 if(0 == (i4_blk_ht & 7)) /* ht multiple of 8 case */
207 {
208 __m128i src4_16x8b, src5_16x8b, src6_16x8b, src7_16x8b;
209
210 for(i = 0; i < i4_blk_ht; i += 8)
211 {
212 for(j = 0; j < i4_blk_wd; j += 16)
213 {
214 src0_16x8b =
215 _mm_loadu_si128((__m128i *) (pu1_src + 0 * i4_src_stride)); // i = 0
216 src1_16x8b =
217 _mm_loadu_si128((__m128i *) (pu1_src + 1 * i4_src_stride)); // i = 1
218 src2_16x8b =
219 _mm_loadu_si128((__m128i *) (pu1_src + 2 * i4_src_stride)); // i = 2
220 src3_16x8b =
221 _mm_loadu_si128((__m128i *) (pu1_src + 3 * i4_src_stride)); // i = 3
222 src4_16x8b =
223 _mm_loadu_si128((__m128i *) (pu1_src + 4 * i4_src_stride)); // i = 4
224 src5_16x8b =
225 _mm_loadu_si128((__m128i *) (pu1_src + 5 * i4_src_stride)); // i = 5
226 src6_16x8b =
227 _mm_loadu_si128((__m128i *) (pu1_src + 6 * i4_src_stride)); // i = 6
228 src7_16x8b =
229 _mm_loadu_si128((__m128i *) (pu1_src + 7 * i4_src_stride)); // i = 7
230
231 _mm_storeu_si128((__m128i *) (pu1_dst + 0 * i4_dst_stride), src0_16x8b);
232 _mm_storeu_si128((__m128i *) (pu1_dst + 1 * i4_dst_stride), src1_16x8b);
233 _mm_storeu_si128((__m128i *) (pu1_dst + 2 * i4_dst_stride), src2_16x8b);
234 _mm_storeu_si128((__m128i *) (pu1_dst + 3 * i4_dst_stride), src3_16x8b);
235 _mm_storeu_si128((__m128i *) (pu1_dst + 4 * i4_dst_stride), src4_16x8b);
236 _mm_storeu_si128((__m128i *) (pu1_dst + 5 * i4_dst_stride), src5_16x8b);
237 _mm_storeu_si128((__m128i *) (pu1_dst + 6 * i4_dst_stride), src6_16x8b);
238 _mm_storeu_si128((__m128i *) (pu1_dst + 7 * i4_dst_stride), src7_16x8b);
239
240 pu1_src += 16;
241 pu1_dst += 16;
242 }
243
244 pu1_src = pu1_src - i4_blk_wd + 8 * i4_src_stride;
245 pu1_dst = pu1_dst - i4_blk_wd + 8 * i4_dst_stride;
246 }
247 }
248 else /* ht multiple of 4 case */
249 {
250 for(i = 0; i < i4_blk_ht; i += 4)
251 {
252 for(j = 0; j < i4_blk_wd; j += 16)
253 {
254 src0_16x8b =
255 _mm_loadu_si128((__m128i *) (pu1_src + 0 * i4_src_stride)); // i = 0
256 src1_16x8b =
257 _mm_loadu_si128((__m128i *) (pu1_src + 1 * i4_src_stride)); // i = 1
258 src2_16x8b =
259 _mm_loadu_si128((__m128i *) (pu1_src + 2 * i4_src_stride)); // i = 2
260 src3_16x8b =
261 _mm_loadu_si128((__m128i *) (pu1_src + 3 * i4_src_stride)); // i = 3
262
263 _mm_storeu_si128((__m128i *) (pu1_dst + 0 * i4_dst_stride), src0_16x8b);
264 _mm_storeu_si128((__m128i *) (pu1_dst + 1 * i4_dst_stride), src1_16x8b);
265 _mm_storeu_si128((__m128i *) (pu1_dst + 2 * i4_dst_stride), src2_16x8b);
266 _mm_storeu_si128((__m128i *) (pu1_dst + 3 * i4_dst_stride), src3_16x8b);
267
268 pu1_src += 16;
269 pu1_dst += 16;
270 }
271
272 pu1_src = pu1_src - i4_blk_wd + 4 * i4_src_stride;
273 pu1_dst = pu1_dst - i4_blk_wd + 4 * i4_dst_stride;
274 }
275 }
276 }
277 else if(0 == (i4_blk_wd & 7)) /* wd multiple of 8 case */
278 {
279 __m128i src0_16x8b, src1_16x8b, src2_16x8b, src3_16x8b;
280
281 if(0 == (i4_blk_ht & 7)) /* ht multiple of 8 case */
282 {
283 __m128i src4_16x8b, src5_16x8b, src6_16x8b, src7_16x8b;
284
285 for(i = 0; i < i4_blk_ht; i += 8)
286 {
287 for(j = 0; j < i4_blk_wd; j += 8)
288 {
289 src0_16x8b =
290 _mm_loadl_epi64((__m128i *) (pu1_src + 0 * i4_src_stride)); // i = 0
291 src1_16x8b =
292 _mm_loadl_epi64((__m128i *) (pu1_src + 1 * i4_src_stride)); // i = 1
293 src2_16x8b =
294 _mm_loadl_epi64((__m128i *) (pu1_src + 2 * i4_src_stride)); // i = 2
295 src3_16x8b =
296 _mm_loadl_epi64((__m128i *) (pu1_src + 3 * i4_src_stride)); // i = 3
297 src4_16x8b =
298 _mm_loadl_epi64((__m128i *) (pu1_src + 4 * i4_src_stride)); // i = 4
299 src5_16x8b =
300 _mm_loadl_epi64((__m128i *) (pu1_src + 5 * i4_src_stride)); // i = 5
301 src6_16x8b =
302 _mm_loadl_epi64((__m128i *) (pu1_src + 6 * i4_src_stride)); // i = 6
303 src7_16x8b =
304 _mm_loadl_epi64((__m128i *) (pu1_src + 7 * i4_src_stride)); // i = 7
305
306 _mm_storel_epi64((__m128i *) (pu1_dst + 0 * i4_dst_stride), src0_16x8b);
307 _mm_storel_epi64((__m128i *) (pu1_dst + 1 * i4_dst_stride), src1_16x8b);
308 _mm_storel_epi64((__m128i *) (pu1_dst + 2 * i4_dst_stride), src2_16x8b);
309 _mm_storel_epi64((__m128i *) (pu1_dst + 3 * i4_dst_stride), src3_16x8b);
310 _mm_storel_epi64((__m128i *) (pu1_dst + 4 * i4_dst_stride), src4_16x8b);
311 _mm_storel_epi64((__m128i *) (pu1_dst + 5 * i4_dst_stride), src5_16x8b);
312 _mm_storel_epi64((__m128i *) (pu1_dst + 6 * i4_dst_stride), src6_16x8b);
313 _mm_storel_epi64((__m128i *) (pu1_dst + 7 * i4_dst_stride), src7_16x8b);
314
315 pu1_src += 8;
316 pu1_dst += 8;
317 }
318
319 pu1_src = pu1_src - i4_blk_wd + 8 * i4_src_stride;
320 pu1_dst = pu1_dst - i4_blk_wd + 8 * i4_dst_stride;
321 }
322 }
323 else /* ht multiple of 4 case */
324 {
325 for(i = 0; i < i4_blk_ht; i += 4)
326 {
327 for(j = 0; j < i4_blk_wd; j += 8)
328 {
329 src0_16x8b =
330 _mm_loadl_epi64((__m128i *) (pu1_src + 0 * i4_src_stride)); // i = 0
331 src1_16x8b =
332 _mm_loadl_epi64((__m128i *) (pu1_src + 1 * i4_src_stride)); // i = 1
333 src2_16x8b =
334 _mm_loadl_epi64((__m128i *) (pu1_src + 2 * i4_src_stride)); // i = 2
335 src3_16x8b =
336 _mm_loadl_epi64((__m128i *) (pu1_src + 3 * i4_src_stride)); // i = 3
337
338 _mm_storel_epi64((__m128i *) (pu1_dst + 0 * i4_dst_stride), src0_16x8b);
339 _mm_storel_epi64((__m128i *) (pu1_dst + 1 * i4_dst_stride), src1_16x8b);
340 _mm_storel_epi64((__m128i *) (pu1_dst + 2 * i4_dst_stride), src2_16x8b);
341 _mm_storel_epi64((__m128i *) (pu1_dst + 3 * i4_dst_stride), src3_16x8b);
342
343 pu1_src += 8;
344 pu1_dst += 8;
345 }
346
347 pu1_src = pu1_src - i4_blk_wd + 4 * i4_src_stride;
348 pu1_dst = pu1_dst - i4_blk_wd + 4 * i4_dst_stride;
349 }
350 }
351 }
352 else /* wd multiple of 4 case */
353 {
354 __m128i src0_16x8b, src1_16x8b, src2_16x8b, src3_16x8b;
355 WORD32 src0, src1, src2, src3;
356 if(0 == (i4_blk_ht & 7)) /* ht multiple of 8 case */
357 {
358 __m128i src4_16x8b, src5_16x8b, src6_16x8b, src7_16x8b;
359 WORD32 src4, src5, src6, src7;
360
361 for(i = 0; i < i4_blk_ht; i += 8)
362 {
363 for(j = 0; j < i4_blk_wd; j += 4)
364 {
365 src0_16x8b =
366 _mm_loadl_epi64((__m128i *) (pu1_src + 0 * i4_src_stride)); // i = 0
367 src1_16x8b =
368 _mm_loadl_epi64((__m128i *) (pu1_src + 1 * i4_src_stride)); // i = 1
369 src2_16x8b =
370 _mm_loadl_epi64((__m128i *) (pu1_src + 2 * i4_src_stride)); // i = 2
371 src3_16x8b =
372 _mm_loadl_epi64((__m128i *) (pu1_src + 3 * i4_src_stride)); // i = 3
373 src4_16x8b =
374 _mm_loadl_epi64((__m128i *) (pu1_src + 4 * i4_src_stride)); // i = 4
375 src5_16x8b =
376 _mm_loadl_epi64((__m128i *) (pu1_src + 5 * i4_src_stride)); // i = 5
377 src6_16x8b =
378 _mm_loadl_epi64((__m128i *) (pu1_src + 6 * i4_src_stride)); // i = 6
379 src7_16x8b =
380 _mm_loadl_epi64((__m128i *) (pu1_src + 7 * i4_src_stride)); // i = 7
381
382 src0 = _mm_cvtsi128_si32(src0_16x8b);
383 src1 = _mm_cvtsi128_si32(src1_16x8b);
384 src2 = _mm_cvtsi128_si32(src2_16x8b);
385 src3 = _mm_cvtsi128_si32(src3_16x8b);
386 src4 = _mm_cvtsi128_si32(src4_16x8b);
387 src5 = _mm_cvtsi128_si32(src5_16x8b);
388 src6 = _mm_cvtsi128_si32(src6_16x8b);
389 src7 = _mm_cvtsi128_si32(src7_16x8b);
390
391 *(WORD32 *) (&pu1_dst[0 * i4_dst_stride]) = src0;
392 *(WORD32 *) (&pu1_dst[1 * i4_dst_stride]) = src1;
393 *(WORD32 *) (&pu1_dst[2 * i4_dst_stride]) = src2;
394 *(WORD32 *) (&pu1_dst[3 * i4_dst_stride]) = src3;
395 *(WORD32 *) (&pu1_dst[4 * i4_dst_stride]) = src4;
396 *(WORD32 *) (&pu1_dst[5 * i4_dst_stride]) = src5;
397 *(WORD32 *) (&pu1_dst[6 * i4_dst_stride]) = src6;
398 *(WORD32 *) (&pu1_dst[7 * i4_dst_stride]) = src7;
399
400 pu1_src += 4;
401 pu1_dst += 4;
402 }
403
404 pu1_src = pu1_src - i4_blk_wd + 8 * i4_src_stride;
405 pu1_dst = pu1_dst - i4_blk_wd + 8 * i4_dst_stride;
406 }
407 }
408 else /* ht multiple of 4 case */
409 {
410 for(i = 0; i < i4_blk_ht; i += 4)
411 {
412 for(j = 0; j < i4_blk_wd; j += 4)
413 {
414 src0_16x8b =
415 _mm_loadl_epi64((__m128i *) (pu1_src + 0 * i4_src_stride)); // i = 0
416 src1_16x8b =
417 _mm_loadl_epi64((__m128i *) (pu1_src + 1 * i4_src_stride)); // i = 1
418 src2_16x8b =
419 _mm_loadl_epi64((__m128i *) (pu1_src + 2 * i4_src_stride)); // i = 2
420 src3_16x8b =
421 _mm_loadl_epi64((__m128i *) (pu1_src + 3 * i4_src_stride)); // i = 3
422
423 src0 = _mm_cvtsi128_si32(src0_16x8b);
424 src1 = _mm_cvtsi128_si32(src1_16x8b);
425 src2 = _mm_cvtsi128_si32(src2_16x8b);
426 src3 = _mm_cvtsi128_si32(src3_16x8b);
427
428 *(WORD32 *) (&pu1_dst[0 * i4_dst_stride]) = src0;
429 *(WORD32 *) (&pu1_dst[1 * i4_dst_stride]) = src1;
430 *(WORD32 *) (&pu1_dst[2 * i4_dst_stride]) = src2;
431 *(WORD32 *) (&pu1_dst[3 * i4_dst_stride]) = src3;
432
433 pu1_src += 4;
434 pu1_dst += 4;
435 }
436
437 pu1_src = pu1_src - i4_blk_wd + 4 * i4_src_stride;
438 pu1_dst = pu1_dst - i4_blk_wd + 4 * i4_dst_stride;
439 }
440 }
441 }
442 }
443