/*
 * Copyright (c) 2020, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <assert.h>
#include <immintrin.h>
#include <string.h>

#include "config/aom_dsp_rtcd.h"

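// Copy one row of 128 bytes with four unaligned 32-byte AVX2 loads/stores.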
static inline void copy_128(const uint8_t *src, uint8_t *dst) {
  __m256i s[4];
  s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 32));
  s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 32));
  s[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 32));
  s[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 32));
  _mm256_storeu_si256((__m256i *)(dst + 0 * 32), s[0]);
  _mm256_storeu_si256((__m256i *)(dst + 1 * 32), s[1]);
  _mm256_storeu_si256((__m256i *)(dst + 2 * 32), s[2]);
  _mm256_storeu_si256((__m256i *)(dst + 3 * 32), s[3]);
}

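// Width-specialized block copy: each branch copies two rows per loop
// iteration, so h must be even. Widths 2 and 4 use memmove(); wider blocks
// use progressively larger SSE2/AVX2 vector loads and stores.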
void aom_convolve_copy_avx2(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride, int w, int h) {
  // The w == 16 case uses _mm_store_si128(), which requires its output address
  // be aligned on a 16-byte boundary.
  if (w == 16) {
    assert(!((intptr_t)dst % 16));
    assert(!(dst_stride % 16));
  }

  if (w == 2) {
    do {
      memmove(dst, src, 2 * sizeof(*src));
      src += src_stride;
      dst += dst_stride;
      memmove(dst, src, 2 * sizeof(*src));
      src += src_stride;
      dst += dst_stride;
      h -= 2;
    } while (h);
  } else if (w == 4) {
    do {
      memmove(dst, src, 4 * sizeof(*src));
      src += src_stride;
      dst += dst_stride;
      memmove(dst, src, 4 * sizeof(*src));
      src += src_stride;
      dst += dst_stride;
      h -= 2;
    } while (h);
  } else if (w == 8) {
    do {
      __m128i s[2];
      s[0] = _mm_loadl_epi64((__m128i *)src);
      src += src_stride;
      s[1] = _mm_loadl_epi64((__m128i *)src);
      src += src_stride;
      _mm_storel_epi64((__m128i *)dst, s[0]);
      dst += dst_stride;
      _mm_storel_epi64((__m128i *)dst, s[1]);
      dst += dst_stride;
      h -= 2;
    } while (h);
  } else if (w == 16) {
    do {
      __m128i s[2];
      s[0] = _mm_loadu_si128((__m128i *)src);
      src += src_stride;
      s[1] = _mm_loadu_si128((__m128i *)src);
      src += src_stride;
      _mm_store_si128((__m128i *)dst, s[0]);
      dst += dst_stride;
      _mm_store_si128((__m128i *)dst, s[1]);
      dst += dst_stride;
      h -= 2;
    } while (h);
  } else if (w == 32) {
    do {
      __m256i s[2];
      s[0] = _mm256_loadu_si256((__m256i *)src);
      src += src_stride;
      s[1] = _mm256_loadu_si256((__m256i *)src);
      src += src_stride;
      _mm256_storeu_si256((__m256i *)dst, s[0]);
      dst += dst_stride;
      _mm256_storeu_si256((__m256i *)dst, s[1]);
      dst += dst_stride;
      h -= 2;
    } while (h);
  } else if (w == 64) {
    do {
      __m256i s[4];
      s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 32));
      s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 32));
      src += src_stride;
      s[2] = _mm256_loadu_si256((__m256i *)(src + 0 * 32));
      s[3] = _mm256_loadu_si256((__m256i *)(src + 1 * 32));
      src += src_stride;
      _mm256_storeu_si256((__m256i *)(dst + 0 * 32), s[0]);
      _mm256_storeu_si256((__m256i *)(dst + 1 * 32), s[1]);
      dst += dst_stride;
      _mm256_storeu_si256((__m256i *)(dst + 0 * 32), s[2]);
      _mm256_storeu_si256((__m256i *)(dst + 1 * 32), s[3]);
      dst += dst_stride;
      h -= 2;
    } while (h);
  } else {
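    // Remaining width: w == 128 (128 bytes per row).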
    do {
      copy_128(src, dst);
      src += src_stride;
      dst += dst_stride;
      copy_128(src, dst);
      src += src_stride;
      dst += dst_stride;
      h -= 2;
    } while (h);
  }
}

#if CONFIG_AV1_HIGHBITDEPTH

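// Copy one row of 64 16-bit pixels (128 bytes) with four 32-byte
// loads/stores.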
static inline void highbd_copy_64(const uint16_t *src, uint16_t *dst) {
  __m256i s[4];
  s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 16));
  s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 16));
  s[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 16));
  s[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16));
  _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[0]);
  _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[1]);
  _mm256_storeu_si256((__m256i *)(dst + 2 * 16), s[2]);
  _mm256_storeu_si256((__m256i *)(dst + 3 * 16), s[3]);
}

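// Copy one row of 128 16-bit pixels (256 bytes) with eight 32-byte
// loads/stores.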
static inline void highbd_copy_128(const uint16_t *src, uint16_t *dst) {
  __m256i s[8];
  s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 16));
  s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 16));
  s[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 16));
  s[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16));
  s[4] = _mm256_loadu_si256((__m256i *)(src + 4 * 16));
  s[5] = _mm256_loadu_si256((__m256i *)(src + 5 * 16));
  s[6] = _mm256_loadu_si256((__m256i *)(src + 6 * 16));
  s[7] = _mm256_loadu_si256((__m256i *)(src + 7 * 16));

  _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[0]);
  _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[1]);
  _mm256_storeu_si256((__m256i *)(dst + 2 * 16), s[2]);
  _mm256_storeu_si256((__m256i *)(dst + 3 * 16), s[3]);
  _mm256_storeu_si256((__m256i *)(dst + 4 * 16), s[4]);
  _mm256_storeu_si256((__m256i *)(dst + 5 * 16), s[5]);
  _mm256_storeu_si256((__m256i *)(dst + 6 * 16), s[6]);
  _mm256_storeu_si256((__m256i *)(dst + 7 * 16), s[7]);
}

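// High-bitdepth variant: w and the strides are measured in 16-bit pixels, so
// a row of w pixels spans 2 * w bytes. As in the 8-bit version, every branch
// copies two rows per iteration, so h must be even.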
void aom_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride,
                                   uint16_t *dst, ptrdiff_t dst_stride, int w,
                                   int h) {
  // The w == 8 case uses _mm_store_si128(), which requires its output address
  // be aligned on a 16-byte boundary.
  if (w == 8) {
    assert(!((intptr_t)dst % 16));
    assert(!(dst_stride % 8));
  }

  if (w == 2) {
    do {
      memmove(dst, src, 2 * sizeof(*src));
      src += src_stride;
      dst += dst_stride;
      memmove(dst, src, 2 * sizeof(*src));
      src += src_stride;
      dst += dst_stride;
      h -= 2;
    } while (h);
  } else if (w == 4) {
    do {
      __m128i s[2];
      s[0] = _mm_loadl_epi64((__m128i *)src);
      src += src_stride;
      s[1] = _mm_loadl_epi64((__m128i *)src);
      src += src_stride;
      _mm_storel_epi64((__m128i *)dst, s[0]);
      dst += dst_stride;
      _mm_storel_epi64((__m128i *)dst, s[1]);
      dst += dst_stride;
      h -= 2;
    } while (h);
  } else if (w == 8) {
    do {
      __m128i s[2];
      s[0] = _mm_loadu_si128((__m128i *)src);
      src += src_stride;
      s[1] = _mm_loadu_si128((__m128i *)src);
      src += src_stride;
      _mm_store_si128((__m128i *)dst, s[0]);
      dst += dst_stride;
      _mm_store_si128((__m128i *)dst, s[1]);
      dst += dst_stride;
      h -= 2;
    } while (h);
  } else if (w == 16) {
    do {
      __m256i s[2];
      s[0] = _mm256_loadu_si256((__m256i *)src);
      src += src_stride;
      s[1] = _mm256_loadu_si256((__m256i *)src);
      src += src_stride;
      _mm256_storeu_si256((__m256i *)dst, s[0]);
      dst += dst_stride;
      _mm256_storeu_si256((__m256i *)dst, s[1]);
      dst += dst_stride;
      h -= 2;
    } while (h);
  } else if (w == 32) {
    do {
      __m256i s[4];
      s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 16));
      s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 16));
      src += src_stride;
      s[2] = _mm256_loadu_si256((__m256i *)(src + 0 * 16));
      s[3] = _mm256_loadu_si256((__m256i *)(src + 1 * 16));
      src += src_stride;
      _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[0]);
      _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[1]);
      dst += dst_stride;
      _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[2]);
      _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[3]);
      dst += dst_stride;
      h -= 2;
    } while (h);
  } else if (w == 64) {
    do {
      highbd_copy_64(src, dst);
      src += src_stride;
      dst += dst_stride;
      highbd_copy_64(src, dst);
      src += src_stride;
      dst += dst_stride;
      h -= 2;
    } while (h);
  } else {
    assert(w == 128);
    do {
      highbd_copy_128(src, dst);
      src += src_stride;
      dst += dst_stride;
      highbd_copy_128(src, dst);
      src += src_stride;
      dst += dst_stride;
      h -= 2;
    } while (h);
  }
}

#endif  // CONFIG_AV1_HIGHBITDEPTH