/*
 * Copyright (c) 2020, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <assert.h>
#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#include "config/aom_dsp_rtcd.h"

// Copies one 128-byte row from src to dst as eight 16-byte vectors.
// All loads are issued before any store (so the row copy behaves like a
// memmove even if the two buffers alias). src may be unaligned; dst must
// be 16-byte aligned because the stores are aligned.
static inline void copy_128(const uint8_t *src, uint8_t *dst) {
  __m128i v[8];
  for (int i = 0; i < 8; ++i) {
    v[i] = _mm_loadu_si128((const __m128i *)(src + i * 16));
  }
  for (int i = 0; i < 8; ++i) {
    _mm_store_si128((__m128i *)(dst + i * 16), v[i]);
  }
}
// Copies a w x h block of 8-bit pixels from src to dst.
// Rows are handled in pairs, so h must be even. Supported widths are
// 2, 4, 8, 16, 32, 64 and 128 (any other width falls into the widest
// path). For w >= 16 the stores are aligned, so dst and dst_stride must
// be 16-byte aligned.
void aom_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride, int w, int h) {
  // The w >= 16 cases use _mm_store_si128(), which requires its output
  // address be aligned on a 16-byte boundary.
  if (w >= 16) {
    assert(!((intptr_t)dst % 16));
    assert(!(dst_stride % 16));
  }

  if (w == 2 || w == 4) {
    // Narrow widths: plain byte copies, two rows per iteration.
    // memmove (not memcpy) tolerates overlapping src/dst rows.
    const size_t row_bytes = (size_t)w;
    do {
      memmove(dst, src, row_bytes);
      src += src_stride;
      dst += dst_stride;
      memmove(dst, src, row_bytes);
      src += src_stride;
      dst += dst_stride;
      h -= 2;
    } while (h);
  } else if (w == 8) {
    // One 8-byte half-vector per row; load both rows before storing.
    do {
      const __m128i row0 = _mm_loadl_epi64((const __m128i *)src);
      src += src_stride;
      const __m128i row1 = _mm_loadl_epi64((const __m128i *)src);
      src += src_stride;
      _mm_storel_epi64((__m128i *)dst, row0);
      dst += dst_stride;
      _mm_storel_epi64((__m128i *)dst, row1);
      dst += dst_stride;
      h -= 2;
    } while (h);
  } else if (w == 16 || w == 32 || w == 64) {
    // Load both rows completely, then store both rows (preserves the
    // original load-before-store ordering within each row pair).
    const int vecs = w >> 4;  // 16-byte vectors per row, at most 4
    do {
      __m128i row0[4], row1[4];
      for (int i = 0; i < vecs; ++i) {
        row0[i] = _mm_loadu_si128((const __m128i *)(src + i * 16));
      }
      src += src_stride;
      for (int i = 0; i < vecs; ++i) {
        row1[i] = _mm_loadu_si128((const __m128i *)(src + i * 16));
      }
      src += src_stride;
      for (int i = 0; i < vecs; ++i) {
        _mm_store_si128((__m128i *)(dst + i * 16), row0[i]);
      }
      dst += dst_stride;
      for (int i = 0; i < vecs; ++i) {
        _mm_store_si128((__m128i *)(dst + i * 16), row1[i]);
      }
      dst += dst_stride;
      h -= 2;
    } while (h);
  } else {
    // w == 128: copy one full row (eight vectors) at a time, loading the
    // whole row before storing it, two rows per iteration.
    do {
      for (int r = 0; r < 2; ++r) {
        __m128i v[8];
        for (int i = 0; i < 8; ++i) {
          v[i] = _mm_loadu_si128((const __m128i *)(src + i * 16));
        }
        for (int i = 0; i < 8; ++i) {
          _mm_store_si128((__m128i *)(dst + i * 16), v[i]);
        }
        src += src_stride;
        dst += dst_stride;
      }
      h -= 2;
    } while (h);
  }
}

#if CONFIG_AV1_HIGHBITDEPTH
// Copies one row of 64 high-bitdepth (uint16_t) pixels — 128 bytes —
// as eight 16-byte vectors. All loads precede all stores, so the copy
// is overlap-safe within the row. src may be unaligned; dst must be
// 16-byte aligned because the stores are aligned.
static inline void highbd_copy_64(const uint16_t *src, uint16_t *dst) {
  __m128i v[8];
  for (int i = 0; i < 8; ++i) {
    v[i] = _mm_loadu_si128((const __m128i *)(src + i * 8));
  }
  for (int i = 0; i < 8; ++i) {
    _mm_store_si128((__m128i *)(dst + i * 8), v[i]);
  }
}
// Copies one row of 128 high-bitdepth (uint16_t) pixels — 256 bytes —
// as sixteen 16-byte vectors. All loads precede all stores, so the copy
// is overlap-safe within the row. src may be unaligned; dst must be
// 16-byte aligned because the stores are aligned.
static inline void highbd_copy_128(const uint16_t *src, uint16_t *dst) {
  __m128i v[16];
  for (int i = 0; i < 16; ++i) {
    v[i] = _mm_loadu_si128((const __m128i *)(src + i * 8));
  }
  for (int i = 0; i < 16; ++i) {
    _mm_store_si128((__m128i *)(dst + i * 8), v[i]);
  }
}
// Copies a w x h block of high-bitdepth (uint16_t) pixels from src to dst.
// Rows are handled in pairs, so h must be even. Supported widths are
// 2, 4, 8, 16, 32, 64 and 128 (any other width falls into the widest
// path). For w >= 8 the stores are aligned, so dst must be 16-byte
// aligned and dst_stride a multiple of 8 pixels.
void aom_highbd_convolve_copy_sse2(const uint16_t *src, ptrdiff_t src_stride,
                                   uint16_t *dst, ptrdiff_t dst_stride, int w,
                                   int h) {
  // The w >= 8 cases use _mm_store_si128(), which requires its output
  // address be aligned on a 16-byte boundary.
  if (w >= 8) {
    assert(!((intptr_t)dst % 16));
    assert(!(dst_stride % 8));
  }

  if (w == 2) {
    // Two pixels = 4 bytes: move them through the low lane of a vector.
    // NOTE(review): the *(int *)dst store assumes dst is 4-byte aligned
    // on this path — confirm against callers.
    do {
      for (int r = 0; r < 2; ++r) {
        const __m128i s = _mm_loadl_epi64((const __m128i *)src);
        *(int *)dst = _mm_cvtsi128_si32(s);
        src += src_stride;
        dst += dst_stride;
      }
      h -= 2;
    } while (h);
  } else if (w == 4) {
    // One 8-byte half-vector per row; load both rows before storing.
    do {
      const __m128i row0 = _mm_loadl_epi64((const __m128i *)src);
      src += src_stride;
      const __m128i row1 = _mm_loadl_epi64((const __m128i *)src);
      src += src_stride;
      _mm_storel_epi64((__m128i *)dst, row0);
      dst += dst_stride;
      _mm_storel_epi64((__m128i *)dst, row1);
      dst += dst_stride;
      h -= 2;
    } while (h);
  } else if (w == 8 || w == 16 || w == 32) {
    // Load both rows completely, then store both rows (preserves the
    // original load-before-store ordering within each row pair).
    const int vecs = w >> 3;  // eight uint16_t pixels per vector, at most 4
    do {
      __m128i row0[4], row1[4];
      for (int i = 0; i < vecs; ++i) {
        row0[i] = _mm_loadu_si128((const __m128i *)(src + i * 8));
      }
      src += src_stride;
      for (int i = 0; i < vecs; ++i) {
        row1[i] = _mm_loadu_si128((const __m128i *)(src + i * 8));
      }
      src += src_stride;
      for (int i = 0; i < vecs; ++i) {
        _mm_store_si128((__m128i *)(dst + i * 8), row0[i]);
      }
      dst += dst_stride;
      for (int i = 0; i < vecs; ++i) {
        _mm_store_si128((__m128i *)(dst + i * 8), row1[i]);
      }
      dst += dst_stride;
      h -= 2;
    } while (h);
  } else {
    // w is 64 or 128: copy one full row per inner pass, loading the whole
    // row before storing it (matches the per-row helper ordering), two
    // rows per iteration.
    const int vecs = w >> 3;  // at most 16
    do {
      for (int r = 0; r < 2; ++r) {
        __m128i v[16];
        for (int i = 0; i < vecs; ++i) {
          v[i] = _mm_loadu_si128((const __m128i *)(src + i * 8));
        }
        for (int i = 0; i < vecs; ++i) {
          _mm_store_si128((__m128i *)(dst + i * 8), v[i]);
        }
        src += src_stride;
        dst += dst_stride;
      }
      h -= 2;
    } while (h);
  }
}
#endif  // CONFIG_AV1_HIGHBITDEPTH