xref: /aosp_15_r20/external/libdav1d/src/x86/mc.h (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
1 /*
2  * Copyright © 2018-2021, VideoLAN and dav1d authors
3  * Copyright © 2018-2021, Two Orioles, LLC
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions are met:
8  *
9  * 1. Redistributions of source code must retain the above copyright notice, this
10  *    list of conditions and the following disclaimer.
11  *
12  * 2. Redistributions in binary form must reproduce the above copyright notice,
13  *    this list of conditions and the following disclaimer in the documentation
14  *    and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
20  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #include "src/cpu.h"
29 #include "src/mc.h"
30 
/*
 * Declare the ssse3, avx2 and avx512icl variants of one assembly function.
 *
 * `type` selects the prototype macro decl_<type>_fn, and `name` is the
 * function's base name, which BF() combines with each suffix.
 *
 * The expansion deliberately does NOT end in a semicolon: the use site
 * supplies it (`decl_fn(mc, foo);`). A trailing semicolon here would leave a
 * stray empty declaration at file scope after every use, which ISO C does not
 * permit.
 */
#define decl_fn(type, name) \
    decl_##type##_fn(BF(name, ssse3)); \
    decl_##type##_fn(BF(name, avx2)); \
    decl_##type##_fn(BF(name, avx512icl))

/*
 * Table-assignment helpers. Each stores the function named
 * dav1d_put_<name> / dav1d_prep_<name> (combined with `suffix` by BF()) into
 * slot `type` of the corresponding table. All of them expect a pointer
 * named `c`, whose mc / mct / mc_scaled / mct_scaled members are the tables,
 * to be in scope at the point of use.
 */
#define init_mc_fn(type, name, suffix) \
    c->mc[type] = BF(dav1d_put_##name, suffix)
#define init_mct_fn(type, name, suffix) \
    c->mct[type] = BF(dav1d_prep_##name, suffix)
#define init_mc_scaled_fn(type, name, suffix) \
    c->mc_scaled[type] = BF(dav1d_put_##name, suffix)
#define init_mct_scaled_fn(type, name, suffix) \
    c->mct_scaled[type] = BF(dav1d_prep_##name, suffix)
43 
44 decl_fn(mc, dav1d_put_8tap_regular);
45 decl_fn(mc, dav1d_put_8tap_regular_smooth);
46 decl_fn(mc, dav1d_put_8tap_regular_sharp);
47 decl_fn(mc, dav1d_put_8tap_smooth);
48 decl_fn(mc, dav1d_put_8tap_smooth_regular);
49 decl_fn(mc, dav1d_put_8tap_smooth_sharp);
50 decl_fn(mc, dav1d_put_8tap_sharp);
51 decl_fn(mc, dav1d_put_8tap_sharp_regular);
52 decl_fn(mc, dav1d_put_8tap_sharp_smooth);
53 decl_fn(mc, dav1d_put_bilin);
54 
55 decl_fn(mct, dav1d_prep_8tap_regular);
56 decl_fn(mct, dav1d_prep_8tap_regular_smooth);
57 decl_fn(mct, dav1d_prep_8tap_regular_sharp);
58 decl_fn(mct, dav1d_prep_8tap_smooth);
59 decl_fn(mct, dav1d_prep_8tap_smooth_regular);
60 decl_fn(mct, dav1d_prep_8tap_smooth_sharp);
61 decl_fn(mct, dav1d_prep_8tap_sharp);
62 decl_fn(mct, dav1d_prep_8tap_sharp_regular);
63 decl_fn(mct, dav1d_prep_8tap_sharp_smooth);
64 decl_fn(mct, dav1d_prep_bilin);
65 
66 decl_fn(mc_scaled, dav1d_put_8tap_scaled_regular);
67 decl_fn(mc_scaled, dav1d_put_8tap_scaled_regular_smooth);
68 decl_fn(mc_scaled, dav1d_put_8tap_scaled_regular_sharp);
69 decl_fn(mc_scaled, dav1d_put_8tap_scaled_smooth);
70 decl_fn(mc_scaled, dav1d_put_8tap_scaled_smooth_regular);
71 decl_fn(mc_scaled, dav1d_put_8tap_scaled_smooth_sharp);
72 decl_fn(mc_scaled, dav1d_put_8tap_scaled_sharp);
73 decl_fn(mc_scaled, dav1d_put_8tap_scaled_sharp_regular);
74 decl_fn(mc_scaled, dav1d_put_8tap_scaled_sharp_smooth);
75 decl_fn(mc_scaled, dav1d_put_bilin_scaled);
76 
77 decl_fn(mct_scaled, dav1d_prep_8tap_scaled_regular);
78 decl_fn(mct_scaled, dav1d_prep_8tap_scaled_regular_smooth);
79 decl_fn(mct_scaled, dav1d_prep_8tap_scaled_regular_sharp);
80 decl_fn(mct_scaled, dav1d_prep_8tap_scaled_smooth);
81 decl_fn(mct_scaled, dav1d_prep_8tap_scaled_smooth_regular);
82 decl_fn(mct_scaled, dav1d_prep_8tap_scaled_smooth_sharp);
83 decl_fn(mct_scaled, dav1d_prep_8tap_scaled_sharp);
84 decl_fn(mct_scaled, dav1d_prep_8tap_scaled_sharp_regular);
85 decl_fn(mct_scaled, dav1d_prep_8tap_scaled_sharp_smooth);
86 decl_fn(mct_scaled, dav1d_prep_bilin_scaled);
87 
88 decl_fn(avg, dav1d_avg);
89 decl_fn(w_avg, dav1d_w_avg);
90 decl_fn(mask, dav1d_mask);
91 decl_fn(w_mask, dav1d_w_mask_420);
92 decl_fn(w_mask, dav1d_w_mask_422);
93 decl_fn(w_mask, dav1d_w_mask_444);
94 decl_fn(blend, dav1d_blend);
95 decl_fn(blend_dir, dav1d_blend_v);
96 decl_fn(blend_dir, dav1d_blend_h);
97 
98 decl_fn(warp8x8, dav1d_warp_affine_8x8);
99 decl_warp8x8_fn(BF(dav1d_warp_affine_8x8, sse4));
100 decl_fn(warp8x8t, dav1d_warp_affine_8x8t);
101 decl_warp8x8t_fn(BF(dav1d_warp_affine_8x8t, sse4));
102 
103 decl_fn(emu_edge, dav1d_emu_edge);
104 
105 decl_fn(resize, dav1d_resize);
106 
mc_dsp_init_x86(Dav1dMCDSPContext * const c)107 static ALWAYS_INLINE void mc_dsp_init_x86(Dav1dMCDSPContext *const c) {
108     const unsigned flags = dav1d_get_cpu_flags();
109 
110     if(!(flags & DAV1D_X86_CPU_FLAG_SSSE3))
111         return;
112 
113     init_mc_fn(FILTER_2D_8TAP_REGULAR,        8tap_regular,        ssse3);
114     init_mc_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, ssse3);
115     init_mc_fn(FILTER_2D_8TAP_REGULAR_SHARP,  8tap_regular_sharp,  ssse3);
116     init_mc_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, ssse3);
117     init_mc_fn(FILTER_2D_8TAP_SMOOTH,         8tap_smooth,         ssse3);
118     init_mc_fn(FILTER_2D_8TAP_SMOOTH_SHARP,   8tap_smooth_sharp,   ssse3);
119     init_mc_fn(FILTER_2D_8TAP_SHARP_REGULAR,  8tap_sharp_regular,  ssse3);
120     init_mc_fn(FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_sharp_smooth,   ssse3);
121     init_mc_fn(FILTER_2D_8TAP_SHARP,          8tap_sharp,          ssse3);
122     init_mc_fn(FILTER_2D_BILINEAR,            bilin,               ssse3);
123 
124     init_mct_fn(FILTER_2D_8TAP_REGULAR,        8tap_regular,        ssse3);
125     init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, ssse3);
126     init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP,  8tap_regular_sharp,  ssse3);
127     init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, ssse3);
128     init_mct_fn(FILTER_2D_8TAP_SMOOTH,         8tap_smooth,         ssse3);
129     init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP,   8tap_smooth_sharp,   ssse3);
130     init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR,  8tap_sharp_regular,  ssse3);
131     init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_sharp_smooth,   ssse3);
132     init_mct_fn(FILTER_2D_8TAP_SHARP,          8tap_sharp,          ssse3);
133     init_mct_fn(FILTER_2D_BILINEAR,            bilin,               ssse3);
134 
135     init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR,        8tap_scaled_regular,        ssse3);
136     init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, ssse3);
137     init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP,  8tap_scaled_regular_sharp,  ssse3);
138     init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_scaled_smooth_regular, ssse3);
139     init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH,         8tap_scaled_smooth,         ssse3);
140     init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_SHARP,   8tap_scaled_smooth_sharp,   ssse3);
141     init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR,  8tap_scaled_sharp_regular,  ssse3);
142     init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_scaled_sharp_smooth,   ssse3);
143     init_mc_scaled_fn(FILTER_2D_8TAP_SHARP,          8tap_scaled_sharp,          ssse3);
144     init_mc_scaled_fn(FILTER_2D_BILINEAR,            bilin_scaled,               ssse3);
145 
146     init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR,        8tap_scaled_regular,        ssse3);
147     init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, ssse3);
148     init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP,  8tap_scaled_regular_sharp,  ssse3);
149     init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_scaled_smooth_regular, ssse3);
150     init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH,         8tap_scaled_smooth,         ssse3);
151     init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH_SHARP,   8tap_scaled_smooth_sharp,   ssse3);
152     init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR,  8tap_scaled_sharp_regular,  ssse3);
153     init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_scaled_sharp_smooth,   ssse3);
154     init_mct_scaled_fn(FILTER_2D_8TAP_SHARP,          8tap_scaled_sharp,          ssse3);
155     init_mct_scaled_fn(FILTER_2D_BILINEAR,            bilin_scaled,               ssse3);
156 
157     c->avg = BF(dav1d_avg, ssse3);
158     c->w_avg = BF(dav1d_w_avg, ssse3);
159     c->mask = BF(dav1d_mask, ssse3);
160     c->w_mask[0] = BF(dav1d_w_mask_444, ssse3);
161     c->w_mask[1] = BF(dav1d_w_mask_422, ssse3);
162     c->w_mask[2] = BF(dav1d_w_mask_420, ssse3);
163     c->blend = BF(dav1d_blend, ssse3);
164     c->blend_v = BF(dav1d_blend_v, ssse3);
165     c->blend_h = BF(dav1d_blend_h, ssse3);
166     c->warp8x8  = BF(dav1d_warp_affine_8x8, ssse3);
167     c->warp8x8t = BF(dav1d_warp_affine_8x8t, ssse3);
168     c->emu_edge = BF(dav1d_emu_edge, ssse3);
169     c->resize = BF(dav1d_resize, ssse3);
170 
171     if(!(flags & DAV1D_X86_CPU_FLAG_SSE41))
172         return;
173 
174 #if BITDEPTH == 8
175     c->warp8x8  = BF(dav1d_warp_affine_8x8, sse4);
176     c->warp8x8t = BF(dav1d_warp_affine_8x8t, sse4);
177 #endif
178 
179 #if ARCH_X86_64
180     if (!(flags & DAV1D_X86_CPU_FLAG_AVX2))
181         return;
182 
183     init_mc_fn(FILTER_2D_8TAP_REGULAR,        8tap_regular,        avx2);
184     init_mc_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx2);
185     init_mc_fn(FILTER_2D_8TAP_REGULAR_SHARP,  8tap_regular_sharp,  avx2);
186     init_mc_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, avx2);
187     init_mc_fn(FILTER_2D_8TAP_SMOOTH,         8tap_smooth,         avx2);
188     init_mc_fn(FILTER_2D_8TAP_SMOOTH_SHARP,   8tap_smooth_sharp,   avx2);
189     init_mc_fn(FILTER_2D_8TAP_SHARP_REGULAR,  8tap_sharp_regular,  avx2);
190     init_mc_fn(FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_sharp_smooth,   avx2);
191     init_mc_fn(FILTER_2D_8TAP_SHARP,          8tap_sharp,          avx2);
192     init_mc_fn(FILTER_2D_BILINEAR,            bilin,               avx2);
193 
194     init_mct_fn(FILTER_2D_8TAP_REGULAR,        8tap_regular,        avx2);
195     init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx2);
196     init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP,  8tap_regular_sharp,  avx2);
197     init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, avx2);
198     init_mct_fn(FILTER_2D_8TAP_SMOOTH,         8tap_smooth,         avx2);
199     init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP,   8tap_smooth_sharp,   avx2);
200     init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR,  8tap_sharp_regular,  avx2);
201     init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_sharp_smooth,   avx2);
202     init_mct_fn(FILTER_2D_8TAP_SHARP,          8tap_sharp,          avx2);
203     init_mct_fn(FILTER_2D_BILINEAR,            bilin,               avx2);
204 
205     init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR,        8tap_scaled_regular,        avx2);
206     init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, avx2);
207     init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP,  8tap_scaled_regular_sharp,  avx2);
208     init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_scaled_smooth_regular, avx2);
209     init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH,         8tap_scaled_smooth,         avx2);
210     init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_SHARP,   8tap_scaled_smooth_sharp,   avx2);
211     init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR,  8tap_scaled_sharp_regular,  avx2);
212     init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_scaled_sharp_smooth,   avx2);
213     init_mc_scaled_fn(FILTER_2D_8TAP_SHARP,          8tap_scaled_sharp,          avx2);
214     init_mc_scaled_fn(FILTER_2D_BILINEAR,            bilin_scaled,               avx2);
215 
216     init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR,        8tap_scaled_regular,        avx2);
217     init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, avx2);
218     init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP,  8tap_scaled_regular_sharp,  avx2);
219     init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_scaled_smooth_regular, avx2);
220     init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH,         8tap_scaled_smooth,         avx2);
221     init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH_SHARP,   8tap_scaled_smooth_sharp,   avx2);
222     init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR,  8tap_scaled_sharp_regular,  avx2);
223     init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_scaled_sharp_smooth,   avx2);
224     init_mct_scaled_fn(FILTER_2D_8TAP_SHARP,          8tap_scaled_sharp,          avx2);
225     init_mct_scaled_fn(FILTER_2D_BILINEAR,            bilin_scaled,               avx2);
226 
227     c->avg = BF(dav1d_avg, avx2);
228     c->w_avg = BF(dav1d_w_avg, avx2);
229     c->mask = BF(dav1d_mask, avx2);
230     c->w_mask[0] = BF(dav1d_w_mask_444, avx2);
231     c->w_mask[1] = BF(dav1d_w_mask_422, avx2);
232     c->w_mask[2] = BF(dav1d_w_mask_420, avx2);
233     c->blend = BF(dav1d_blend, avx2);
234     c->blend_v = BF(dav1d_blend_v, avx2);
235     c->blend_h = BF(dav1d_blend_h, avx2);
236     c->warp8x8  = BF(dav1d_warp_affine_8x8, avx2);
237     c->warp8x8t = BF(dav1d_warp_affine_8x8t, avx2);
238     c->emu_edge = BF(dav1d_emu_edge, avx2);
239     c->resize = BF(dav1d_resize, avx2);
240 
241     if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL))
242         return;
243 
244     init_mc_fn (FILTER_2D_8TAP_REGULAR,        8tap_regular,        avx512icl);
245     init_mc_fn (FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx512icl);
246     init_mc_fn (FILTER_2D_8TAP_REGULAR_SHARP,  8tap_regular_sharp,  avx512icl);
247     init_mc_fn (FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, avx512icl);
248     init_mc_fn (FILTER_2D_8TAP_SMOOTH,         8tap_smooth,         avx512icl);
249     init_mc_fn (FILTER_2D_8TAP_SMOOTH_SHARP,   8tap_smooth_sharp,   avx512icl);
250     init_mc_fn (FILTER_2D_8TAP_SHARP_REGULAR,  8tap_sharp_regular,  avx512icl);
251     init_mc_fn (FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_sharp_smooth,   avx512icl);
252     init_mc_fn (FILTER_2D_8TAP_SHARP,          8tap_sharp,          avx512icl);
253     init_mc_fn (FILTER_2D_BILINEAR,            bilin,               avx512icl);
254 
255     init_mct_fn(FILTER_2D_8TAP_REGULAR,        8tap_regular,        avx512icl);
256     init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx512icl);
257     init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP,  8tap_regular_sharp,  avx512icl);
258     init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, avx512icl);
259     init_mct_fn(FILTER_2D_8TAP_SMOOTH,         8tap_smooth,         avx512icl);
260     init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP,   8tap_smooth_sharp,   avx512icl);
261     init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR,  8tap_sharp_regular,  avx512icl);
262     init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_sharp_smooth,   avx512icl);
263     init_mct_fn(FILTER_2D_8TAP_SHARP,          8tap_sharp,          avx512icl);
264     init_mct_fn(FILTER_2D_BILINEAR,            bilin,               avx512icl);
265 
266     c->avg = BF(dav1d_avg, avx512icl);
267     c->w_avg = BF(dav1d_w_avg, avx512icl);
268     c->mask = BF(dav1d_mask, avx512icl);
269     c->w_mask[0] = BF(dav1d_w_mask_444, avx512icl);
270     c->w_mask[1] = BF(dav1d_w_mask_422, avx512icl);
271     c->w_mask[2] = BF(dav1d_w_mask_420, avx512icl);
272     c->blend = BF(dav1d_blend, avx512icl);
273     c->blend_v = BF(dav1d_blend_v, avx512icl);
274     c->blend_h = BF(dav1d_blend_h, avx512icl);
275 
276     if (!(flags & DAV1D_X86_CPU_FLAG_SLOW_GATHER)) {
277         c->resize = BF(dav1d_resize, avx512icl);
278         c->warp8x8  = BF(dav1d_warp_affine_8x8, avx512icl);
279         c->warp8x8t = BF(dav1d_warp_affine_8x8t, avx512icl);
280     }
281 #endif
282 }
283