1 /*
2 * Copyright © 2018-2021, VideoLAN and dav1d authors
3 * Copyright © 2018-2021, Two Orioles, LLC
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright notice, this
10 * list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright notice,
13 * this list of conditions and the following disclaimer in the documentation
14 * and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
20 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28 #include "src/cpu.h"
29 #include "src/mc.h"
30
/*
 * Helper macros for declaring and installing the x86 assembly functions.
 *
 * decl_fn(type, name) invokes decl_<type>_fn() once for each of the ssse3,
 * avx2 and avx512icl variants of `name` (as produced by BF(name, suffix)).
 * The expansion intentionally carries no trailing semicolon: the invocation
 * supplies it, so `decl_fn(...);` does not leave an empty declaration (";;")
 * at file scope, which ISO C does not permit.
 *
 * init_mc_fn / init_mct_fn / init_mc_scaled_fn / init_mct_scaled_fn store
 * BF(dav1d_put_<name>, suffix) or BF(dav1d_prep_<name>, suffix) into slot
 * `type` of the mc / mct / mc_scaled / mct_scaled table. They require a
 * variable named `c` to be in scope where they are used.
 */
#define decl_fn(type, name) \
    decl_##type##_fn(BF(name, ssse3)); \
    decl_##type##_fn(BF(name, avx2)); \
    decl_##type##_fn(BF(name, avx512icl))
#define init_mc_fn(type, name, suffix) \
    c->mc[type] = BF(dav1d_put_##name, suffix)
#define init_mct_fn(type, name, suffix) \
    c->mct[type] = BF(dav1d_prep_##name, suffix)
#define init_mc_scaled_fn(type, name, suffix) \
    c->mc_scaled[type] = BF(dav1d_put_##name, suffix)
#define init_mct_scaled_fn(type, name, suffix) \
    c->mct_scaled[type] = BF(dav1d_prep_##name, suffix)
43
44 decl_fn(mc, dav1d_put_8tap_regular);
45 decl_fn(mc, dav1d_put_8tap_regular_smooth);
46 decl_fn(mc, dav1d_put_8tap_regular_sharp);
47 decl_fn(mc, dav1d_put_8tap_smooth);
48 decl_fn(mc, dav1d_put_8tap_smooth_regular);
49 decl_fn(mc, dav1d_put_8tap_smooth_sharp);
50 decl_fn(mc, dav1d_put_8tap_sharp);
51 decl_fn(mc, dav1d_put_8tap_sharp_regular);
52 decl_fn(mc, dav1d_put_8tap_sharp_smooth);
53 decl_fn(mc, dav1d_put_bilin);
54
55 decl_fn(mct, dav1d_prep_8tap_regular);
56 decl_fn(mct, dav1d_prep_8tap_regular_smooth);
57 decl_fn(mct, dav1d_prep_8tap_regular_sharp);
58 decl_fn(mct, dav1d_prep_8tap_smooth);
59 decl_fn(mct, dav1d_prep_8tap_smooth_regular);
60 decl_fn(mct, dav1d_prep_8tap_smooth_sharp);
61 decl_fn(mct, dav1d_prep_8tap_sharp);
62 decl_fn(mct, dav1d_prep_8tap_sharp_regular);
63 decl_fn(mct, dav1d_prep_8tap_sharp_smooth);
64 decl_fn(mct, dav1d_prep_bilin);
65
66 decl_fn(mc_scaled, dav1d_put_8tap_scaled_regular);
67 decl_fn(mc_scaled, dav1d_put_8tap_scaled_regular_smooth);
68 decl_fn(mc_scaled, dav1d_put_8tap_scaled_regular_sharp);
69 decl_fn(mc_scaled, dav1d_put_8tap_scaled_smooth);
70 decl_fn(mc_scaled, dav1d_put_8tap_scaled_smooth_regular);
71 decl_fn(mc_scaled, dav1d_put_8tap_scaled_smooth_sharp);
72 decl_fn(mc_scaled, dav1d_put_8tap_scaled_sharp);
73 decl_fn(mc_scaled, dav1d_put_8tap_scaled_sharp_regular);
74 decl_fn(mc_scaled, dav1d_put_8tap_scaled_sharp_smooth);
75 decl_fn(mc_scaled, dav1d_put_bilin_scaled);
76
77 decl_fn(mct_scaled, dav1d_prep_8tap_scaled_regular);
78 decl_fn(mct_scaled, dav1d_prep_8tap_scaled_regular_smooth);
79 decl_fn(mct_scaled, dav1d_prep_8tap_scaled_regular_sharp);
80 decl_fn(mct_scaled, dav1d_prep_8tap_scaled_smooth);
81 decl_fn(mct_scaled, dav1d_prep_8tap_scaled_smooth_regular);
82 decl_fn(mct_scaled, dav1d_prep_8tap_scaled_smooth_sharp);
83 decl_fn(mct_scaled, dav1d_prep_8tap_scaled_sharp);
84 decl_fn(mct_scaled, dav1d_prep_8tap_scaled_sharp_regular);
85 decl_fn(mct_scaled, dav1d_prep_8tap_scaled_sharp_smooth);
86 decl_fn(mct_scaled, dav1d_prep_bilin_scaled);
87
88 decl_fn(avg, dav1d_avg);
89 decl_fn(w_avg, dav1d_w_avg);
90 decl_fn(mask, dav1d_mask);
91 decl_fn(w_mask, dav1d_w_mask_420);
92 decl_fn(w_mask, dav1d_w_mask_422);
93 decl_fn(w_mask, dav1d_w_mask_444);
94 decl_fn(blend, dav1d_blend);
95 decl_fn(blend_dir, dav1d_blend_v);
96 decl_fn(blend_dir, dav1d_blend_h);
97
98 decl_fn(warp8x8, dav1d_warp_affine_8x8);
99 decl_warp8x8_fn(BF(dav1d_warp_affine_8x8, sse4));
100 decl_fn(warp8x8t, dav1d_warp_affine_8x8t);
101 decl_warp8x8t_fn(BF(dav1d_warp_affine_8x8t, sse4));
102
103 decl_fn(emu_edge, dav1d_emu_edge);
104
105 decl_fn(resize, dav1d_resize);
106
mc_dsp_init_x86(Dav1dMCDSPContext * const c)107 static ALWAYS_INLINE void mc_dsp_init_x86(Dav1dMCDSPContext *const c) {
108 const unsigned flags = dav1d_get_cpu_flags();
109
110 if(!(flags & DAV1D_X86_CPU_FLAG_SSSE3))
111 return;
112
113 init_mc_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, ssse3);
114 init_mc_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, ssse3);
115 init_mc_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, ssse3);
116 init_mc_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, ssse3);
117 init_mc_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, ssse3);
118 init_mc_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, ssse3);
119 init_mc_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, ssse3);
120 init_mc_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, ssse3);
121 init_mc_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, ssse3);
122 init_mc_fn(FILTER_2D_BILINEAR, bilin, ssse3);
123
124 init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, ssse3);
125 init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, ssse3);
126 init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, ssse3);
127 init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, ssse3);
128 init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, ssse3);
129 init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, ssse3);
130 init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, ssse3);
131 init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, ssse3);
132 init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, ssse3);
133 init_mct_fn(FILTER_2D_BILINEAR, bilin, ssse3);
134
135 init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR, 8tap_scaled_regular, ssse3);
136 init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, ssse3);
137 init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_scaled_regular_sharp, ssse3);
138 init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_scaled_smooth_regular, ssse3);
139 init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH, 8tap_scaled_smooth, ssse3);
140 init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_scaled_smooth_sharp, ssse3);
141 init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_scaled_sharp_regular, ssse3);
142 init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_scaled_sharp_smooth, ssse3);
143 init_mc_scaled_fn(FILTER_2D_8TAP_SHARP, 8tap_scaled_sharp, ssse3);
144 init_mc_scaled_fn(FILTER_2D_BILINEAR, bilin_scaled, ssse3);
145
146 init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR, 8tap_scaled_regular, ssse3);
147 init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, ssse3);
148 init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_scaled_regular_sharp, ssse3);
149 init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_scaled_smooth_regular, ssse3);
150 init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH, 8tap_scaled_smooth, ssse3);
151 init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_scaled_smooth_sharp, ssse3);
152 init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_scaled_sharp_regular, ssse3);
153 init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_scaled_sharp_smooth, ssse3);
154 init_mct_scaled_fn(FILTER_2D_8TAP_SHARP, 8tap_scaled_sharp, ssse3);
155 init_mct_scaled_fn(FILTER_2D_BILINEAR, bilin_scaled, ssse3);
156
157 c->avg = BF(dav1d_avg, ssse3);
158 c->w_avg = BF(dav1d_w_avg, ssse3);
159 c->mask = BF(dav1d_mask, ssse3);
160 c->w_mask[0] = BF(dav1d_w_mask_444, ssse3);
161 c->w_mask[1] = BF(dav1d_w_mask_422, ssse3);
162 c->w_mask[2] = BF(dav1d_w_mask_420, ssse3);
163 c->blend = BF(dav1d_blend, ssse3);
164 c->blend_v = BF(dav1d_blend_v, ssse3);
165 c->blend_h = BF(dav1d_blend_h, ssse3);
166 c->warp8x8 = BF(dav1d_warp_affine_8x8, ssse3);
167 c->warp8x8t = BF(dav1d_warp_affine_8x8t, ssse3);
168 c->emu_edge = BF(dav1d_emu_edge, ssse3);
169 c->resize = BF(dav1d_resize, ssse3);
170
171 if(!(flags & DAV1D_X86_CPU_FLAG_SSE41))
172 return;
173
174 #if BITDEPTH == 8
175 c->warp8x8 = BF(dav1d_warp_affine_8x8, sse4);
176 c->warp8x8t = BF(dav1d_warp_affine_8x8t, sse4);
177 #endif
178
179 #if ARCH_X86_64
180 if (!(flags & DAV1D_X86_CPU_FLAG_AVX2))
181 return;
182
183 init_mc_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, avx2);
184 init_mc_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx2);
185 init_mc_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, avx2);
186 init_mc_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, avx2);
187 init_mc_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, avx2);
188 init_mc_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, avx2);
189 init_mc_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, avx2);
190 init_mc_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, avx2);
191 init_mc_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, avx2);
192 init_mc_fn(FILTER_2D_BILINEAR, bilin, avx2);
193
194 init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, avx2);
195 init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx2);
196 init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, avx2);
197 init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, avx2);
198 init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, avx2);
199 init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, avx2);
200 init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, avx2);
201 init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, avx2);
202 init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, avx2);
203 init_mct_fn(FILTER_2D_BILINEAR, bilin, avx2);
204
205 init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR, 8tap_scaled_regular, avx2);
206 init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, avx2);
207 init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_scaled_regular_sharp, avx2);
208 init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_scaled_smooth_regular, avx2);
209 init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH, 8tap_scaled_smooth, avx2);
210 init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_scaled_smooth_sharp, avx2);
211 init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_scaled_sharp_regular, avx2);
212 init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_scaled_sharp_smooth, avx2);
213 init_mc_scaled_fn(FILTER_2D_8TAP_SHARP, 8tap_scaled_sharp, avx2);
214 init_mc_scaled_fn(FILTER_2D_BILINEAR, bilin_scaled, avx2);
215
216 init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR, 8tap_scaled_regular, avx2);
217 init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, avx2);
218 init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_scaled_regular_sharp, avx2);
219 init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_scaled_smooth_regular, avx2);
220 init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH, 8tap_scaled_smooth, avx2);
221 init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_scaled_smooth_sharp, avx2);
222 init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_scaled_sharp_regular, avx2);
223 init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_scaled_sharp_smooth, avx2);
224 init_mct_scaled_fn(FILTER_2D_8TAP_SHARP, 8tap_scaled_sharp, avx2);
225 init_mct_scaled_fn(FILTER_2D_BILINEAR, bilin_scaled, avx2);
226
227 c->avg = BF(dav1d_avg, avx2);
228 c->w_avg = BF(dav1d_w_avg, avx2);
229 c->mask = BF(dav1d_mask, avx2);
230 c->w_mask[0] = BF(dav1d_w_mask_444, avx2);
231 c->w_mask[1] = BF(dav1d_w_mask_422, avx2);
232 c->w_mask[2] = BF(dav1d_w_mask_420, avx2);
233 c->blend = BF(dav1d_blend, avx2);
234 c->blend_v = BF(dav1d_blend_v, avx2);
235 c->blend_h = BF(dav1d_blend_h, avx2);
236 c->warp8x8 = BF(dav1d_warp_affine_8x8, avx2);
237 c->warp8x8t = BF(dav1d_warp_affine_8x8t, avx2);
238 c->emu_edge = BF(dav1d_emu_edge, avx2);
239 c->resize = BF(dav1d_resize, avx2);
240
241 if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL))
242 return;
243
244 init_mc_fn (FILTER_2D_8TAP_REGULAR, 8tap_regular, avx512icl);
245 init_mc_fn (FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx512icl);
246 init_mc_fn (FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, avx512icl);
247 init_mc_fn (FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, avx512icl);
248 init_mc_fn (FILTER_2D_8TAP_SMOOTH, 8tap_smooth, avx512icl);
249 init_mc_fn (FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, avx512icl);
250 init_mc_fn (FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, avx512icl);
251 init_mc_fn (FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, avx512icl);
252 init_mc_fn (FILTER_2D_8TAP_SHARP, 8tap_sharp, avx512icl);
253 init_mc_fn (FILTER_2D_BILINEAR, bilin, avx512icl);
254
255 init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, avx512icl);
256 init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx512icl);
257 init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, avx512icl);
258 init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, avx512icl);
259 init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, avx512icl);
260 init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, avx512icl);
261 init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, avx512icl);
262 init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, avx512icl);
263 init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, avx512icl);
264 init_mct_fn(FILTER_2D_BILINEAR, bilin, avx512icl);
265
266 c->avg = BF(dav1d_avg, avx512icl);
267 c->w_avg = BF(dav1d_w_avg, avx512icl);
268 c->mask = BF(dav1d_mask, avx512icl);
269 c->w_mask[0] = BF(dav1d_w_mask_444, avx512icl);
270 c->w_mask[1] = BF(dav1d_w_mask_422, avx512icl);
271 c->w_mask[2] = BF(dav1d_w_mask_420, avx512icl);
272 c->blend = BF(dav1d_blend, avx512icl);
273 c->blend_v = BF(dav1d_blend_v, avx512icl);
274 c->blend_h = BF(dav1d_blend_h, avx512icl);
275
276 if (!(flags & DAV1D_X86_CPU_FLAG_SLOW_GATHER)) {
277 c->resize = BF(dav1d_resize, avx512icl);
278 c->warp8x8 = BF(dav1d_warp_affine_8x8, avx512icl);
279 c->warp8x8t = BF(dav1d_warp_affine_8x8t, avx512icl);
280 }
281 #endif
282 }
283