xref: /aosp_15_r20/external/libaom/aom_dsp/x86/aom_convolve_copy_sse2.c (revision 77c1e3ccc04c968bd2bc212e87364f250e820521)
1 /*
2  * Copyright (c) 2020, Alliance for Open Media. All rights reserved.
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
#include <assert.h>
#include <immintrin.h>
#include <string.h>

#include "config/aom_dsp_rtcd.h"
15 
// Copies one row of 128 8-bit pixels from src to dst: 8 unaligned 16-byte
// loads followed by 8 aligned 16-byte stores. dst must be 16-byte aligned
// (asserted by the caller).
static inline void copy_128(const uint8_t *src, uint8_t *dst) {
  __m128i s[8];
  // Cast to const __m128i * to match _mm_loadu_si128's parameter type and
  // preserve the const qualifier of src.
  s[0] = _mm_loadu_si128((const __m128i *)(src + 0 * 16));
  s[1] = _mm_loadu_si128((const __m128i *)(src + 1 * 16));
  s[2] = _mm_loadu_si128((const __m128i *)(src + 2 * 16));
  s[3] = _mm_loadu_si128((const __m128i *)(src + 3 * 16));
  s[4] = _mm_loadu_si128((const __m128i *)(src + 4 * 16));
  s[5] = _mm_loadu_si128((const __m128i *)(src + 5 * 16));
  s[6] = _mm_loadu_si128((const __m128i *)(src + 6 * 16));
  s[7] = _mm_loadu_si128((const __m128i *)(src + 7 * 16));
  _mm_store_si128((__m128i *)(dst + 0 * 16), s[0]);
  _mm_store_si128((__m128i *)(dst + 1 * 16), s[1]);
  _mm_store_si128((__m128i *)(dst + 2 * 16), s[2]);
  _mm_store_si128((__m128i *)(dst + 3 * 16), s[3]);
  _mm_store_si128((__m128i *)(dst + 4 * 16), s[4]);
  _mm_store_si128((__m128i *)(dst + 5 * 16), s[5]);
  _mm_store_si128((__m128i *)(dst + 6 * 16), s[6]);
  _mm_store_si128((__m128i *)(dst + 7 * 16), s[7]);
}
35 
// Copies a w x h block of 8-bit pixels from src to dst. Each branch handles
// one supported block width (2, 4, 8, 16, 32, 64, or 128 via the final else).
// Rows are processed two at a time, so h must be even (all codec block
// heights are).
void aom_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride, int w, int h) {
  // The w >= 16 cases use _mm_store_si128(), which requires its output address
  // be aligned on a 16-byte boundary.
  if (w >= 16) {
    assert(!((intptr_t)dst % 16));
    assert(!(dst_stride % 16));
  }

  if (w == 2) {
    do {
      // NOTE(review): memmove (rather than memcpy) suggests src and dst may
      // overlap for the narrow widths — confirm against callers.
      memmove(dst, src, 2 * sizeof(*src));
      src += src_stride;
      dst += dst_stride;
      memmove(dst, src, 2 * sizeof(*src));
      src += src_stride;
      dst += dst_stride;
      h -= 2;
    } while (h);
  } else if (w == 4) {
    do {
      memmove(dst, src, 4 * sizeof(*src));
      src += src_stride;
      dst += dst_stride;
      memmove(dst, src, 4 * sizeof(*src));
      src += src_stride;
      dst += dst_stride;
      h -= 2;
    } while (h);
  } else if (w == 8) {
    do {
      __m128i s[2];
      // Loads cast to const __m128i * to keep src's const qualifier; this
      // matches the intrinsics' declared parameter type.
      s[0] = _mm_loadl_epi64((const __m128i *)src);
      src += src_stride;
      s[1] = _mm_loadl_epi64((const __m128i *)src);
      src += src_stride;
      _mm_storel_epi64((__m128i *)dst, s[0]);
      dst += dst_stride;
      _mm_storel_epi64((__m128i *)dst, s[1]);
      dst += dst_stride;
      h -= 2;
    } while (h);
  } else if (w == 16) {
    do {
      __m128i s[2];
      s[0] = _mm_loadu_si128((const __m128i *)src);
      src += src_stride;
      s[1] = _mm_loadu_si128((const __m128i *)src);
      src += src_stride;
      _mm_store_si128((__m128i *)dst, s[0]);
      dst += dst_stride;
      _mm_store_si128((__m128i *)dst, s[1]);
      dst += dst_stride;
      h -= 2;
    } while (h);
  } else if (w == 32) {
    do {
      __m128i s[4];
      s[0] = _mm_loadu_si128((const __m128i *)(src + 0 * 16));
      s[1] = _mm_loadu_si128((const __m128i *)(src + 1 * 16));
      src += src_stride;
      s[2] = _mm_loadu_si128((const __m128i *)(src + 0 * 16));
      s[3] = _mm_loadu_si128((const __m128i *)(src + 1 * 16));
      src += src_stride;
      _mm_store_si128((__m128i *)(dst + 0 * 16), s[0]);
      _mm_store_si128((__m128i *)(dst + 1 * 16), s[1]);
      dst += dst_stride;
      _mm_store_si128((__m128i *)(dst + 0 * 16), s[2]);
      _mm_store_si128((__m128i *)(dst + 1 * 16), s[3]);
      dst += dst_stride;
      h -= 2;
    } while (h);
  } else if (w == 64) {
    do {
      __m128i s[8];
      s[0] = _mm_loadu_si128((const __m128i *)(src + 0 * 16));
      s[1] = _mm_loadu_si128((const __m128i *)(src + 1 * 16));
      s[2] = _mm_loadu_si128((const __m128i *)(src + 2 * 16));
      s[3] = _mm_loadu_si128((const __m128i *)(src + 3 * 16));
      src += src_stride;
      s[4] = _mm_loadu_si128((const __m128i *)(src + 0 * 16));
      s[5] = _mm_loadu_si128((const __m128i *)(src + 1 * 16));
      s[6] = _mm_loadu_si128((const __m128i *)(src + 2 * 16));
      s[7] = _mm_loadu_si128((const __m128i *)(src + 3 * 16));
      src += src_stride;
      _mm_store_si128((__m128i *)(dst + 0 * 16), s[0]);
      _mm_store_si128((__m128i *)(dst + 1 * 16), s[1]);
      _mm_store_si128((__m128i *)(dst + 2 * 16), s[2]);
      _mm_store_si128((__m128i *)(dst + 3 * 16), s[3]);
      dst += dst_stride;
      _mm_store_si128((__m128i *)(dst + 0 * 16), s[4]);
      _mm_store_si128((__m128i *)(dst + 1 * 16), s[5]);
      _mm_store_si128((__m128i *)(dst + 2 * 16), s[6]);
      _mm_store_si128((__m128i *)(dst + 3 * 16), s[7]);
      dst += dst_stride;
      h -= 2;
    } while (h);
  } else {
    // Remaining width is 128.
    do {
      copy_128(src, dst);
      src += src_stride;
      dst += dst_stride;
      copy_128(src, dst);
      src += src_stride;
      dst += dst_stride;
      h -= 2;
    } while (h);
  }
}
145 
146 #if CONFIG_AV1_HIGHBITDEPTH
// Copies one row of 64 16-bit pixels (128 bytes) from src to dst: 8 unaligned
// loads followed by 8 aligned stores. dst must be 16-byte aligned (asserted
// by the caller).
static inline void highbd_copy_64(const uint16_t *src, uint16_t *dst) {
  __m128i s[8];
  // Cast to const __m128i * to match _mm_loadu_si128's parameter type and
  // preserve the const qualifier of src.
  s[0] = _mm_loadu_si128((const __m128i *)(src + 0 * 8));
  s[1] = _mm_loadu_si128((const __m128i *)(src + 1 * 8));
  s[2] = _mm_loadu_si128((const __m128i *)(src + 2 * 8));
  s[3] = _mm_loadu_si128((const __m128i *)(src + 3 * 8));
  s[4] = _mm_loadu_si128((const __m128i *)(src + 4 * 8));
  s[5] = _mm_loadu_si128((const __m128i *)(src + 5 * 8));
  s[6] = _mm_loadu_si128((const __m128i *)(src + 6 * 8));
  s[7] = _mm_loadu_si128((const __m128i *)(src + 7 * 8));
  _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]);
  _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]);
  _mm_store_si128((__m128i *)(dst + 2 * 8), s[2]);
  _mm_store_si128((__m128i *)(dst + 3 * 8), s[3]);
  _mm_store_si128((__m128i *)(dst + 4 * 8), s[4]);
  _mm_store_si128((__m128i *)(dst + 5 * 8), s[5]);
  _mm_store_si128((__m128i *)(dst + 6 * 8), s[6]);
  _mm_store_si128((__m128i *)(dst + 7 * 8), s[7]);
}
166 
// Copies one row of 128 16-bit pixels (256 bytes) from src to dst: 16
// unaligned loads followed by 16 aligned stores. dst must be 16-byte aligned
// (asserted by the caller).
static inline void highbd_copy_128(const uint16_t *src, uint16_t *dst) {
  __m128i s[16];
  // Cast to const __m128i * to match _mm_loadu_si128's parameter type and
  // preserve the const qualifier of src.
  s[0] = _mm_loadu_si128((const __m128i *)(src + 0 * 8));
  s[1] = _mm_loadu_si128((const __m128i *)(src + 1 * 8));
  s[2] = _mm_loadu_si128((const __m128i *)(src + 2 * 8));
  s[3] = _mm_loadu_si128((const __m128i *)(src + 3 * 8));
  s[4] = _mm_loadu_si128((const __m128i *)(src + 4 * 8));
  s[5] = _mm_loadu_si128((const __m128i *)(src + 5 * 8));
  s[6] = _mm_loadu_si128((const __m128i *)(src + 6 * 8));
  s[7] = _mm_loadu_si128((const __m128i *)(src + 7 * 8));
  s[8] = _mm_loadu_si128((const __m128i *)(src + 8 * 8));
  s[9] = _mm_loadu_si128((const __m128i *)(src + 9 * 8));
  s[10] = _mm_loadu_si128((const __m128i *)(src + 10 * 8));
  s[11] = _mm_loadu_si128((const __m128i *)(src + 11 * 8));
  s[12] = _mm_loadu_si128((const __m128i *)(src + 12 * 8));
  s[13] = _mm_loadu_si128((const __m128i *)(src + 13 * 8));
  s[14] = _mm_loadu_si128((const __m128i *)(src + 14 * 8));
  s[15] = _mm_loadu_si128((const __m128i *)(src + 15 * 8));
  _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]);
  _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]);
  _mm_store_si128((__m128i *)(dst + 2 * 8), s[2]);
  _mm_store_si128((__m128i *)(dst + 3 * 8), s[3]);
  _mm_store_si128((__m128i *)(dst + 4 * 8), s[4]);
  _mm_store_si128((__m128i *)(dst + 5 * 8), s[5]);
  _mm_store_si128((__m128i *)(dst + 6 * 8), s[6]);
  _mm_store_si128((__m128i *)(dst + 7 * 8), s[7]);
  _mm_store_si128((__m128i *)(dst + 8 * 8), s[8]);
  _mm_store_si128((__m128i *)(dst + 9 * 8), s[9]);
  _mm_store_si128((__m128i *)(dst + 10 * 8), s[10]);
  _mm_store_si128((__m128i *)(dst + 11 * 8), s[11]);
  _mm_store_si128((__m128i *)(dst + 12 * 8), s[12]);
  _mm_store_si128((__m128i *)(dst + 13 * 8), s[13]);
  _mm_store_si128((__m128i *)(dst + 14 * 8), s[14]);
  _mm_store_si128((__m128i *)(dst + 15 * 8), s[15]);
}
202 
// Copies a w x h block of 16-bit (high bit depth) pixels from src to dst.
// Each branch handles one supported block width (2, 4, 8, 16, 32, 64, or 128
// via the final else). Rows are processed two at a time, so h must be even.
void aom_highbd_convolve_copy_sse2(const uint16_t *src, ptrdiff_t src_stride,
                                   uint16_t *dst, ptrdiff_t dst_stride, int w,
                                   int h) {
  // The w >= 8 cases use _mm_store_si128(), which requires its output address
  // be aligned on a 16-byte boundary. (8 pixels * 2 bytes = 16 bytes.)
  if (w >= 8) {
    assert(!((intptr_t)dst % 16));
    assert(!(dst_stride % 8));
  }

  if (w == 2) {
    do {
      // Loads 8 bytes but only the low 4 (two 16-bit pixels) are stored.
      // NOTE(review): this reads 4 bytes past the two needed pixels —
      // presumably the convolve buffers are padded; confirm against callers.
      __m128i s = _mm_loadl_epi64((const __m128i *)src);
      // Writes both pixels with a single 32-bit store via a type-punned
      // pointer, as in the original implementation.
      *(int *)dst = _mm_cvtsi128_si32(s);
      src += src_stride;
      dst += dst_stride;
      s = _mm_loadl_epi64((const __m128i *)src);
      *(int *)dst = _mm_cvtsi128_si32(s);
      src += src_stride;
      dst += dst_stride;
      h -= 2;
    } while (h);
  } else if (w == 4) {
    do {
      __m128i s[2];
      // Loads cast to const __m128i * to keep src's const qualifier; this
      // matches the intrinsics' declared parameter type.
      s[0] = _mm_loadl_epi64((const __m128i *)src);
      src += src_stride;
      s[1] = _mm_loadl_epi64((const __m128i *)src);
      src += src_stride;
      _mm_storel_epi64((__m128i *)dst, s[0]);
      dst += dst_stride;
      _mm_storel_epi64((__m128i *)dst, s[1]);
      dst += dst_stride;
      h -= 2;
    } while (h);
  } else if (w == 8) {
    do {
      __m128i s[2];
      s[0] = _mm_loadu_si128((const __m128i *)src);
      src += src_stride;
      s[1] = _mm_loadu_si128((const __m128i *)src);
      src += src_stride;
      _mm_store_si128((__m128i *)dst, s[0]);
      dst += dst_stride;
      _mm_store_si128((__m128i *)dst, s[1]);
      dst += dst_stride;
      h -= 2;
    } while (h);
  } else if (w == 16) {
    do {
      __m128i s[4];
      s[0] = _mm_loadu_si128((const __m128i *)(src + 0 * 8));
      s[1] = _mm_loadu_si128((const __m128i *)(src + 1 * 8));
      src += src_stride;
      s[2] = _mm_loadu_si128((const __m128i *)(src + 0 * 8));
      s[3] = _mm_loadu_si128((const __m128i *)(src + 1 * 8));
      src += src_stride;
      _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]);
      _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]);
      dst += dst_stride;
      _mm_store_si128((__m128i *)(dst + 0 * 8), s[2]);
      _mm_store_si128((__m128i *)(dst + 1 * 8), s[3]);
      dst += dst_stride;
      h -= 2;
    } while (h);
  } else if (w == 32) {
    do {
      __m128i s[8];
      s[0] = _mm_loadu_si128((const __m128i *)(src + 0 * 8));
      s[1] = _mm_loadu_si128((const __m128i *)(src + 1 * 8));
      s[2] = _mm_loadu_si128((const __m128i *)(src + 2 * 8));
      s[3] = _mm_loadu_si128((const __m128i *)(src + 3 * 8));
      src += src_stride;
      s[4] = _mm_loadu_si128((const __m128i *)(src + 0 * 8));
      s[5] = _mm_loadu_si128((const __m128i *)(src + 1 * 8));
      s[6] = _mm_loadu_si128((const __m128i *)(src + 2 * 8));
      s[7] = _mm_loadu_si128((const __m128i *)(src + 3 * 8));
      src += src_stride;
      _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]);
      _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]);
      _mm_store_si128((__m128i *)(dst + 2 * 8), s[2]);
      _mm_store_si128((__m128i *)(dst + 3 * 8), s[3]);
      dst += dst_stride;
      _mm_store_si128((__m128i *)(dst + 0 * 8), s[4]);
      _mm_store_si128((__m128i *)(dst + 1 * 8), s[5]);
      _mm_store_si128((__m128i *)(dst + 2 * 8), s[6]);
      _mm_store_si128((__m128i *)(dst + 3 * 8), s[7]);
      dst += dst_stride;
      h -= 2;
    } while (h);
  } else if (w == 64) {
    do {
      highbd_copy_64(src, dst);
      src += src_stride;
      dst += dst_stride;
      highbd_copy_64(src, dst);
      src += src_stride;
      dst += dst_stride;
      h -= 2;
    } while (h);
  } else {
    // Remaining width is 128.
    do {
      highbd_copy_128(src, dst);
      src += src_stride;
      dst += dst_stride;
      highbd_copy_128(src, dst);
      src += src_stride;
      dst += dst_stride;
      h -= 2;
    } while (h);
  }
}
315 #endif  // CONFIG_AV1_HIGHBITDEPTH
316