1 /*
2 * Copyright © 2018, VideoLAN and dav1d authors
3 * Copyright © 2018, Two Orioles, LLC
4 * Copyright © 2024, Luca Barbato
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright notice, this
11 * list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright notice,
14 * this list of conditions and the following disclaimer in the documentation
15 * and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
21 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
24 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29 #include "config.h"
30
31 #undef NDEBUG
32 #include <assert.h>
33
34 #include <stdlib.h>
35
36 #include "common/attributes.h"
37 #include "common/intops.h"
38
39 #include "src/ppc/dav1d_types.h"
40 #include "src/ppc/loopfilter.h"
41
42 #if BITDEPTH == 8
43
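// Loads four consecutive rows of 16 pixels each (stridea apart) into
// idx##0..idx##3. Used by the h_* paths, where the p/q samples of an edge
// lie along a row and have to be transposed into per-sample vectors.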
44 #define LOAD4_H(idx) \
45 u8x16 idx##0 = vec_xl(0, dst); /* p1_0 p0_0 q0_0 q1_0 */ \
46 dst += stridea; \
47 u8x16 idx##1 = vec_xl(0, dst); /* p1_1 p0_1 q0_1 q1_1 */ \
48 dst += stridea; \
49 u8x16 idx##2 = vec_xl(0, dst); /* p1_2 p0_2 q0_2 q1_2 */ \
50 dst += stridea; \
51 u8x16 idx##3 = vec_xl(0, dst); /* p1_3 p0_3 q0_3 q1_3 */ \
52
53 // return idx##_01 and idx##_23
54 #define LOAD4_H_SINGLE(idx) \
55 LOAD4_H(idx) \
56 \
57 u8x16 idx##_01 = vec_mergeh(idx##0, idx##1); /* p1_0 p1_1 p0_0 p0_1 q0_0 q0_1 q1_0 q1_1 */ \
58 u8x16 idx##_23 = vec_mergeh(idx##2, idx##3); /* p1_2 p1_3 p0_2 p0_3 q0_2 q0_3 q1_2 q1_3 */
59
60
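// The wider filters work on pixels widened to 16 bits, keeping the high and
// low halves of each 16-lane u8 vector in separate u16x8 values (suffixes
// ##h / ##l). The helpers below add, round-and-shift, pack-and-select and
// widen both halves in lockstep.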
61 #define DECLARE_ADD_16HL(r, a, b) \
62 u16x8 r##h = vec_add(a##h, b##h); \
63 u16x8 r##l = vec_add(a##l, b##l);
64
65 #define ADD_16HL(r, a, b) \
66 r##h = vec_add(a##h, b##h); \
67 r##l = vec_add(a##l, b##l);
68
69 #define ADD_AND_SHIFT4(v) \
70 v##h = vec_sr(vec_add(v##h, v4u16), v3u16); \
71 v##l = vec_sr(vec_add(v##l, v4u16), v3u16);
72 #define ADD_AND_SHIFT8(v) \
73 v##h = vec_sr(vec_add(v##h, v8u16), v4u16); \
74 v##l = vec_sr(vec_add(v##l, v8u16), v4u16);
75
76 #define PACK_AND_SEL(v, m) \
77 vec_sel(v, vec_pack(o##v##h, o##v##l), m)
78
79 #define UNPACK_16(v) \
80 u16x8 v##h = u8h_to_u16(v); \
81 u16x8 v##l = u8l_to_u16(v);
82
83
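// APPLY_4: the narrow filter. hev flags lanes with high edge variance; the
// filter value is 3 * (q0 - p0) plus a hev-gated, clamped (p1 - q1) term,
// computed in signed arithmetic after XORing with 128. p0/q0 move by
// (f + 3) >> 3 and (f + 4) >> 3, and p1/q1 move by half that amount on the
// non-hev lanes. All adjustments are masked by fm.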
84 #define APPLY_4 \
85 b8x16 hev = vec_cmpgt(max_a_p1p0_q1q0, H); \
86 \
87 i8x16 ps1 = (i8x16)vec_xor(p1, s); \
88 i8x16 ps0 = (i8x16)vec_xor(p0, s); \
89 i8x16 qs0 = (i8x16)vec_xor(q0, s); \
90 i8x16 qs1 = (i8x16)vec_xor(q1, s); \
91 i8x16 f0 = vec_and(vec_subs(ps1, qs1), hev); \
92 i16x8 q0sh = (i16x8)q0h; \
93 i16x8 q0sl = (i16x8)q0l; \
94 i16x8 p0sh = (i16x8)p0h; \
95 i16x8 p0sl = (i16x8)p0l; \
96 i16x8 f0h = i8h_to_i16(f0); \
97 i16x8 f0l = i8l_to_i16(f0); \
98 i16x8 d0h = vec_sub(q0sh, p0sh); \
99 i16x8 d0l = vec_sub(q0sl, p0sl); \
100 u8x16 v3u8 = vec_splat_u8(3); \
101 i16x8 d0h_2 = vec_add(d0h, d0h); \
102 i16x8 d0l_2 = vec_add(d0l, d0l); \
103 u8x16 v4u8 = vec_splat_u8(4); \
104 i16x8 f0_d0h = vec_add(d0h, f0h); \
105 i16x8 f0_d0l = vec_add(d0l, f0l); \
106 i16x8 fh = vec_add(d0h_2, f0_d0h); \
107 i16x8 fl = vec_add(d0l_2, f0_d0l); \
108 i8x16 f = vec_packs(fh, fl); \
109 i8x16 f1 = vec_adds(f, (i8x16)v4u8); \
110 i8x16 f2 = vec_adds(f, (i8x16)v3u8); \
111 f1 = vec_sra(f1, v3u8); \
112 f2 = vec_sra(f2, v3u8); \
113 f1 = vec_and(f1, fm); \
114 f2 = vec_and(f2, fm); \
115 i8x16 f3 = vec_adds(f1, (i8x16)v1u8); \
116 b8x16 m3 = vec_and(~hev, (b8x16)fm); \
117 f3 = vec_sra(f3, v1u8); \
118 f3 = vec_and(f3, m3); \
119 i8x16 op0s = vec_adds(ps0, f2); \
120 i8x16 oq0s = vec_subs(qs0, f1); \
121 i8x16 oq1s = vec_subs(qs1, f3); \
122 i8x16 op1s = vec_adds(ps1, f3); \
123 p0 = (u8x16)vec_xor(op0s, s); \
124 q0 = (u8x16)vec_xor(oq0s, s); \
125 q1 = (u8x16)vec_xor(oq1s, s); \
126 p1 = (u8x16)vec_xor(op1s, s);
127
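// APPLY_8: the 8-pixel flat filter. Each of p2..q2 becomes a (sum + 4) >> 3
// weighted average over the p3..q3 window, built from shared pairwise
// partial sums; the results are blended in under the apply_8 mask.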
128 #define APPLY_8 \
129 DECLARE_ADD_16HL(p1p0, p1, p0) \
130 DECLARE_ADD_16HL(p2q0, p2, q0) \
131 DECLARE_ADD_16HL(q1q2, q1, q2) \
132 DECLARE_ADD_16HL(p3p3, p3, p3) \
133 DECLARE_ADD_16HL(q0q3, q0, q3) \
134 DECLARE_ADD_16HL(p3p2, p3, p2) \
135 DECLARE_ADD_16HL(p1q1, p1, q1) \
136 DECLARE_ADD_16HL(p3p0, p3, p0) \
137 DECLARE_ADD_16HL(p0q2, p0, q2) \
138 DECLARE_ADD_16HL(q1q3, q1, q3) \
139 DECLARE_ADD_16HL(q3q3, q3, q3) \
140 DECLARE_ADD_16HL(q0q1q2q3, q0q3, q1q2) \
141 DECLARE_ADD_16HL(p2p1p0q0, p1p0, p2q0) \
142 DECLARE_ADD_16HL(p3p3p3p2, p3p3, p3p2) \
143 DECLARE_ADD_16HL(p3p3p1q1, p3p3, p1q1) \
144 DECLARE_ADD_16HL(p3p0q1q2, p3p0, q1q2) \
145 DECLARE_ADD_16HL(p1p0q1q3, p1p0, q1q3) \
146 DECLARE_ADD_16HL(p0q2q3q3, p0q2, q3q3) \
147 \
148 DECLARE_ADD_16HL(op2, p3p3p3p2, p2p1p0q0) \
149 DECLARE_ADD_16HL(op1, p3p3p1q1, p2p1p0q0) \
150 DECLARE_ADD_16HL(op0, p3p0q1q2, p2p1p0q0) \
151 DECLARE_ADD_16HL(oq0, p2p1p0q0, q0q1q2q3) \
152 DECLARE_ADD_16HL(oq1, p1p0q1q3, q0q1q2q3) \
153 DECLARE_ADD_16HL(oq2, p0q2q3q3, q0q1q2q3) \
154 \
155 ADD_AND_SHIFT4(op2) \
156 ADD_AND_SHIFT4(op1) \
157 ADD_AND_SHIFT4(op0) \
158 ADD_AND_SHIFT4(oq0) \
159 ADD_AND_SHIFT4(oq1) \
160 ADD_AND_SHIFT4(oq2) \
161 \
162 p2 = PACK_AND_SEL(p2, apply_8); \
163 p1 = PACK_AND_SEL(p1, apply_8); \
164 p0 = PACK_AND_SEL(p0, apply_8); \
165 q0 = PACK_AND_SEL(q0, apply_8); \
166 q1 = PACK_AND_SEL(q1, apply_8); \
167 q2 = PACK_AND_SEL(q2, apply_8);
168
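// APPLY_16: the wide filter for 16-pixel edges. Each of p5..q5 becomes a
// (sum + 8) >> 4 weighted average over the p6..q6 window; the results are
// blended in under the apply_16 mask.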
169 #define APPLY_16 \
170 DECLARE_ADD_16HL(p6p6, p6, p6) \
171 DECLARE_ADD_16HL(p6p5, p6, p5) \
172 DECLARE_ADD_16HL(p6p4, p6, p4) \
173 DECLARE_ADD_16HL(p4p3, p4, p3) \
174 DECLARE_ADD_16HL(p2p1, p2, p1) \
175 DECLARE_ADD_16HL(p2q2, p2, q2) \
176 DECLARE_ADD_16HL(p3q1, p3, q1) \
177 DECLARE_ADD_16HL(p0q0, p0, q0) \
178 DECLARE_ADD_16HL(p0q1, p0, q1) \
179 DECLARE_ADD_16HL(p1q3, p1, q3) \
180 DECLARE_ADD_16HL(p1q0, p1, q0) \
181 DECLARE_ADD_16HL(p1q5, p1, q5) \
182 DECLARE_ADD_16HL(q3q4, q3, q4) \
183 DECLARE_ADD_16HL(q2q5, q2, q5) \
184 DECLARE_ADD_16HL(q1q6, q1, q6) \
185 DECLARE_ADD_16HL(q0q1, q0, q1) \
186 DECLARE_ADD_16HL(q6q6, q6, q6) \
187 DECLARE_ADD_16HL(q2q6, q2, q6) \
188 DECLARE_ADD_16HL(q3q6, q3, q6) \
189 DECLARE_ADD_16HL(q4q6, q4, q6) \
190 DECLARE_ADD_16HL(p5q0, p5, q0) \
191 \
192 DECLARE_ADD_16HL(p6q2, p6, q2) \
193 DECLARE_ADD_16HL(p6p6p6p4, p6p6, p6p4) \
194 DECLARE_ADD_16HL(p6p5p2p1, p6p5, p2p1) \
195 DECLARE_ADD_16HL(p4p3p0q0, p4p3, p0q0) \
196 DECLARE_ADD_16HL(p2q2p3q1, p2q2, p3q1) \
197 DECLARE_ADD_16HL(p6p5p6p6, p6p5, p6p6) \
198 DECLARE_ADD_16HL(p6p5p3q1, p6p5, p3q1) \
199 DECLARE_ADD_16HL(p6p6p1q3, p6p6, p1q3) \
200 DECLARE_ADD_16HL(q2q5q3q4, q2q5, q3q4) \
201 DECLARE_ADD_16HL(p2p1q1q6, p2p1, q1q6) \
202 DECLARE_ADD_16HL(p0q0q3q6, p0q0, q3q6) \
203 DECLARE_ADD_16HL(q4q6q6q6, q4q6, q6q6) \
204 u16x8 q5q6q6q6h = vec_madd(v3u16, q6h, q5h); \
205 u16x8 q5q6q6q6l = vec_madd(v3u16, q6l, q5l); \
206 DECLARE_ADD_16HL(p0q0q1q6, p0q0, q1q6) \
207 DECLARE_ADD_16HL(p0q1q3q4, p0q1, q3q4) \
208 \
209 DECLARE_ADD_16HL(p6q2p2p1, p6q2, p2p1) \
210 DECLARE_ADD_16HL(p1q0q2q5, p1q0, q2q5) \
211 DECLARE_ADD_16HL(p0q1p5q0, p0q1, p5q0) \
212 DECLARE_ADD_16HL(q0q1q2q6, q0q1, q2q6) \
213 DECLARE_ADD_16HL(p3q1q2q6, p3q1, q2q6) \
214 DECLARE_ADD_16HL(q2q6q4q6, q2q6, q4q6) \
215 DECLARE_ADD_16HL(q3q6p1q5, q3q6, p1q5) \
216 \
217 DECLARE_ADD_16HL(p4p3p0q0p2p1q1q6, p4p3p0q0, p2p1q1q6) \
218 DECLARE_ADD_16HL(p6p5p2p1p4p3p0q0, p6p5p2p1, p4p3p0q0) \
219 DECLARE_ADD_16HL(p2p1q1q6q2q5q3q4, p2p1q1q6, q2q5q3q4) \
220 DECLARE_ADD_16HL(q2q5q3q4q4q6q6q6, q2q5q3q4, q4q6q6q6) \
221 DECLARE_ADD_16HL(p6p5p2p1p4p3p0q0p2q2p3q1, p6p5p2p1p4p3p0q0, p2q2p3q1) \
222 DECLARE_ADD_16HL(p6p6p6p4p6p5p2p1p4p3p0q0, p6p6p6p4, p6p5p2p1p4p3p0q0) \
223 DECLARE_ADD_16HL(p4p3p0q0p2p1q1q6q2q5q3q4, p4p3p0q0p2p1q1q6, q2q5q3q4) \
224 DECLARE_ADD_16HL(p2p1q1q6q2q5q3q4p0q0q3q6, p2p1q1q6q2q5q3q4, p0q0q3q6) \
225 DECLARE_ADD_16HL(p0q0q1q6q2q5q3q4q4q6q6q6, p0q0q1q6, q2q5q3q4q4q6q6q6) \
226 DECLARE_ADD_16HL(p6p5p2p1p4p3p0q0p0q1q3q4, p6p5p2p1p4p3p0q0, p0q1q3q4) \
227 \
228 DECLARE_ADD_16HL(op5, p6p6p6p4p6p5p2p1p4p3p0q0, p6p5p6p6) \
229 DECLARE_ADD_16HL(op4, p6p6p6p4p6p5p2p1p4p3p0q0, p6p5p3q1) \
230 DECLARE_ADD_16HL(op3, p6p6p6p4, p6p5p2p1p4p3p0q0p2q2p3q1) \
231 DECLARE_ADD_16HL(op2, p6p6p1q3, p6p5p2p1p4p3p0q0p2q2p3q1) \
232 DECLARE_ADD_16HL(op1, p6p5p2p1p4p3p0q0p0q1q3q4, p6q2p2p1) \
233 DECLARE_ADD_16HL(op0, p6p5p2p1p4p3p0q0p0q1q3q4, p1q0q2q5) \
234 DECLARE_ADD_16HL(oq0, p4p3p0q0p2p1q1q6q2q5q3q4, p0q1p5q0) \
235 DECLARE_ADD_16HL(oq1, p4p3p0q0p2p1q1q6q2q5q3q4, q0q1q2q6) \
236 DECLARE_ADD_16HL(oq2, p2p1q1q6q2q5q3q4p0q0q3q6, p3q1q2q6) \
237 DECLARE_ADD_16HL(oq3, p2p1q1q6q2q5q3q4p0q0q3q6, q2q6q4q6) \
238 DECLARE_ADD_16HL(oq4, p0q0q1q6q2q5q3q4q4q6q6q6, q3q6p1q5) \
239 DECLARE_ADD_16HL(oq5, p0q0q1q6q2q5q3q4q4q6q6q6, q5q6q6q6) \
240 \
241 ADD_AND_SHIFT8(op5) \
242 ADD_AND_SHIFT8(op4) \
243 ADD_AND_SHIFT8(op3) \
244 ADD_AND_SHIFT8(op2) \
245 ADD_AND_SHIFT8(op1) \
246 ADD_AND_SHIFT8(op0) \
247 ADD_AND_SHIFT8(oq0) \
248 ADD_AND_SHIFT8(oq1) \
249 ADD_AND_SHIFT8(oq2) \
250 ADD_AND_SHIFT8(oq3) \
251 ADD_AND_SHIFT8(oq4) \
252 ADD_AND_SHIFT8(oq5) \
253 \
254 p5 = PACK_AND_SEL(p5, apply_16); \
255 p4 = PACK_AND_SEL(p4, apply_16); \
256 p3 = PACK_AND_SEL(p3, apply_16); \
257 p2 = PACK_AND_SEL(p2, apply_16); \
258 p1 = PACK_AND_SEL(p1, apply_16); \
259 p0 = PACK_AND_SEL(p0, apply_16); \
260 q0 = PACK_AND_SEL(q0, apply_16); \
261 q1 = PACK_AND_SEL(q1, apply_16); \
262 q2 = PACK_AND_SEL(q2, apply_16); \
263 q3 = PACK_AND_SEL(q3, apply_16); \
264 q4 = PACK_AND_SEL(q4, apply_16); \
265 q5 = PACK_AND_SEL(q5, apply_16); \
266
267
268
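// Writes the 4 bytes of each 32-bit lane of out to four successive rows
// (stridea apart); used to store p1 p0 q0 q1 for one transposed 4-row group.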
269 static inline void store_h_4(u8x16 out, uint8_t *dst, int stridea)
270 {
271 u8x16 out1 = (u8x16)vec_splat((u32x4)out, 1);
272 u8x16 out2 = (u8x16)vec_splat((u32x4)out, 2);
273 u8x16 out3 = (u8x16)vec_splat((u32x4)out, 3);
274 vec_xst_len(out, dst, 4);
275 dst += stridea;
276 vec_xst_len(out1, dst, 4);
277 dst += stridea;
278 vec_xst_len(out2, dst, 4);
279 dst += stridea;
280 vec_xst_len(out3, dst, 4);
281 }
282
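// Writes 6 bytes (p2 p1 p0 q0 q1 q2) per row for four rows; outa and outb
// each carry two rows in their low and high 64-bit halves.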
283 static inline void store_h_8(u8x16 outa, u8x16 outb, uint8_t *dst, int stridea)
284 {
285 u8x16 out1 = (u8x16)vec_mergel((u64x2)outa, (u64x2)outa);
286 u8x16 out3 = (u8x16)vec_mergel((u64x2)outb, (u64x2)outb);
287 vec_xst_len(outa, dst, 6);
288 dst += stridea;
289 vec_xst_len(out1, dst, 6);
290 dst += stridea;
291 vec_xst_len(outb, dst, 6);
292 dst += stridea;
293 vec_xst_len(out3, dst, 6);
294 }
295
296 // Assumes a layout {v}0 {v}1 {v}2 {v}3 and produces {v}01 {v}23
297 #define MERGEH_4(v) \
298 u8x16 v##01 = vec_mergeh(v##0, v##1); \
299 u8x16 v##23 = vec_mergeh(v##2, v##3);
300
301 #define MERGEL_4(v) \
302 u8x16 v##01 = vec_mergel(v##0, v##1); \
303 u8x16 v##23 = vec_mergel(v##2, v##3);
304
305 // produce {v}0123h
306 #define MERGEH_U16_0123(v) \
307 u16x8 v##0123h = vec_mergeh((u16x8)v##01, (u16x8)v##23);
308
309 #define MERGEHL_U16_0123(v) \
310 u16x8 v##0123l = vec_mergel((u16x8)v##01, (u16x8)v##23);
311
312 #define MERGE_U16_0123(v) \
313 u16x8 v##0123h = vec_mergeh((u16x8)v##01, (u16x8)v##23); \
314 u16x8 v##0123l = vec_mergel((u16x8)v##01, (u16x8)v##23);
315
316 // produce {ac,bd}0123h{dir}
317 #define MERGEH_U32_LINE(dir) \
318 u32x4 ac0123h##dir = vec_mergeh((u32x4)a0123##dir, (u32x4)c0123##dir); \
319 u32x4 bd0123h##dir = vec_mergeh((u32x4)b0123##dir, (u32x4)d0123##dir);
320
321 #define MERGEL_U32_LINE(dir) \
322 u32x4 ac0123l##dir = vec_mergel((u32x4)a0123##dir, (u32x4)c0123##dir); \
323 u32x4 bd0123l##dir = vec_mergel((u32x4)b0123##dir, (u32x4)d0123##dir);
324
325
326 // produces the mergeh/mergel pair of {ac,bd}0123{dira}{dirb}
327 #define MERGE_U32(oh, ol, dira, dirb) \
328 oh = (u8x16)vec_mergeh(ac0123##dira##dirb, bd0123##dira##dirb); \
329 ol = (u8x16)vec_mergel(ac0123##dira##dirb, bd0123##dira##dirb);
330
331 #define MERGEHL_U8(a, b) \
332 u8x16 a##b##h = vec_mergeh(a, b); \
333 u8x16 a##b##l = vec_mergel(a, b);
334
335 #define MERGEHL_U16(out, a, b) \
336 u8x16 out##h = (u8x16)vec_mergeh((u16x8)a, (u16x8)b); \
337 u8x16 out##l = (u8x16)vec_mergel((u16x8)a, (u16x8)b);
338
339 #define MERGEHL_U32(out, a, b) \
340 u8x16 out##h = (u8x16)vec_mergeh((u32x4)a, (u32x4)b); \
341 u8x16 out##l = (u8x16)vec_mergel((u32x4)a, (u32x4)b);
342
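// Filters one edge across 16 rows (four 4-row groups, one per lane of
// apply): the rows are loaded and transposed into p1/p0/q0/q1 vectors, the
// filter mask fm is derived from the E/I thresholds, APPLY_4 is applied
// under fm, and the results are transposed back and stored per group.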
343 static inline void
344 loop_filter_h_4_all(uint8_t *dst, u8x16 E, u8x16 I, u8x16 H,
345 const ptrdiff_t stridea, b32x4 apply)
346 {
347 dst -= 2;
348 uint8_t *dst2 = dst;
349 u8x16 p1, p0, q0, q1;
350
351 LOAD4_H(a)
352 dst += stridea;
353 LOAD4_H(b)
354 dst += stridea;
355 LOAD4_H(c)
356 dst += stridea;
357 LOAD4_H(d)
358
359 MERGEH_4(a)
360 MERGEH_4(b)
361 MERGEH_4(c)
362 MERGEH_4(d)
363
364 MERGEH_U16_0123(a)
365 MERGEH_U16_0123(b)
366 MERGEH_U16_0123(c)
367 MERGEH_U16_0123(d)
368
369 MERGEH_U32_LINE(h)
370 MERGEL_U32_LINE(h)
371
372 MERGE_U32(p1, p0, h, h)
373 MERGE_U32(q0, q1, l, h)
374
375 const u8x16 zero = vec_splat_u8(0);
376 const u8x16 v1u8 = vec_splat_u8(1);
377 const b8x16 s = (b8x16)vec_splats((uint8_t)128);
378
379 const u8x16 a_p1_p0 = vec_absd(p1, p0);
380 const u8x16 a_q1_q0 = vec_absd(q1, q0);
381 const u8x16 a_p0_q0 = vec_absd(p0, q0);
382 const u8x16 a_p1_q1 = vec_absd(p1, q1);
383
384 u8x16 cmp_E = vec_adds(a_p0_q0, a_p0_q0);
385 const u8x16 max_a_p1p0_q1q0 = vec_max(a_p1_p0, a_q1_q0);
386 const u8x16 cmp_I = max_a_p1p0_q1q0;
387 cmp_E = vec_adds(vec_sr(a_p1_q1, v1u8), cmp_E);
388 const b8x16 ltI = vec_cmple(cmp_I, I);
389 const b8x16 ltE = vec_cmple(cmp_E, E);
390 b8x16 fm = vec_and(ltI, ltE);
391
392 fm = vec_and(fm, (b8x16)apply);
393 if (vec_all_eq(fm, zero))
394 return;
395
396 UNPACK_16(p0)
397 UNPACK_16(q0)
398
399 APPLY_4
400
401 u8x16 p1p0ab = (u8x16)vec_mergeh(p1, p0); // p1 p0 ...
402 u8x16 q0q1ab = (u8x16)vec_mergeh(q0, q1); // q0 q1 ...
403 u8x16 p1p0cd = (u8x16)vec_mergel(p1, p0); // p1 p0 ...
404 u8x16 q0q1cd = (u8x16)vec_mergel(q0, q1); // q0 q1 ...
405
406 u8x16 outa = (u8x16)vec_mergeh((u16x8)p1p0ab, (u16x8)q0q1ab); // op1 op0 oq0 oq1 ...
407 u8x16 outb = (u8x16)vec_mergel((u16x8)p1p0ab, (u16x8)q0q1ab);
408 u8x16 outc = (u8x16)vec_mergeh((u16x8)p1p0cd, (u16x8)q0q1cd);
409 u8x16 outd = (u8x16)vec_mergel((u16x8)p1p0cd, (u16x8)q0q1cd);
410
411 if (apply[0]) {
412 store_h_4(outa, dst2, stridea);
413 }
414 dst2 += 4 * stridea;
415 if (apply[1]) {
416 store_h_4(outb, dst2, stridea);
417 }
418 dst2 += 4 * stridea;
419 if (apply[2]) {
420 store_h_4(outc, dst2, stridea);
421 }
422 dst2 += 4 * stridea;
423 if (apply[3]) {
424 store_h_4(outd, dst2, stridea);
425 }
426 }
427
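// Same row layout as loop_filter_h_4_all, but p2/q2 are also loaded so the
// 6-pixel flat test can be evaluated for the m6 lanes: rows that pass it get
// the 6-tap averaging filter, the remaining fm rows fall back to APPLY_4.
// Only p1..q1 are written back.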
428 static inline void
429 loop_filter_h_6_all(uint8_t *dst, u8x16 E, u8x16 I, u8x16 H,
430 const ptrdiff_t stridea, b32x4 apply, b32x4 m6)
431 {
432 uint8_t *dst2 = dst - 2;
433 dst -= 3;
434 u8x16 p2, p1, p0, q0, q1, q2;
435
436 LOAD4_H(a)
437 dst += stridea;
438 LOAD4_H(b)
439 dst += stridea;
440 LOAD4_H(c)
441 dst += stridea;
442 LOAD4_H(d)
443
444 MERGEH_4(a)
445 MERGEH_4(b)
446 MERGEH_4(c)
447 MERGEH_4(d)
448
449 MERGE_U16_0123(a)
450 MERGE_U16_0123(b)
451 MERGE_U16_0123(c)
452 MERGE_U16_0123(d)
453
454 MERGEH_U32_LINE(h)
455 MERGEL_U32_LINE(h)
456 MERGEH_U32_LINE(l)
457
458 MERGE_U32(p2, p1, h, h)
459 MERGE_U32(p0, q0, l, h)
460 MERGE_U32(q1, q2, h, l)
461
462 const u8x16 F = vec_splat_u8(1);
463
464 const u8x16 zero = vec_splat_u8(0);
465 const u16x8 v3u16 = vec_splat_u16(3);
466 const u16x8 v4u16 = vec_splat_u16(4);
467 const u8x16 v1u8 = vec_splat_u8(1);
468 const b8x16 s = (b8x16)vec_splats((uint8_t)128);
469
470 const u8x16 a_p1_p0 = vec_absd(p1, p0);
471 const u8x16 a_q1_q0 = vec_absd(q1, q0);
472 const u8x16 a_p0_q0 = vec_absd(p0, q0);
473 const u8x16 a_p1_q1 = vec_absd(p1, q1);
474 const u8x16 a_p2_p1 = vec_absd(p2, p1);
475 const u8x16 a_q2_q1 = vec_absd(q2, q1);
476 const u8x16 a_p2_p0 = vec_absd(p2, p0);
477 const u8x16 a_q2_q0 = vec_absd(q2, q0);
478
479 u8x16 max_a_p2p1_q2q1 = vec_max(a_p2_p1, a_q2_q1);
480 u8x16 cmp_E = vec_adds(a_p0_q0, a_p0_q0);
481 const u8x16 max_a_p1p0_q1q0 = vec_max(a_p1_p0, a_q1_q0);
482 u8x16 max_a_p2p0_q2q0 = vec_max(a_p2_p0, a_q2_q0);
483 u8x16 cmp_I_m6 = max_a_p2p1_q2q1;
484 u8x16 cmp_I_m4 = max_a_p1p0_q1q0;
485 cmp_E = vec_adds(vec_sr(a_p1_q1, v1u8), cmp_E);
486 cmp_I_m6 = vec_and(cmp_I_m6, (u8x16)m6);
487 u8x16 cmp_I = vec_max(cmp_I_m4, cmp_I_m6);
488 const b8x16 ltE = vec_cmple(cmp_E, E);
489 const b8x16 ltI = vec_cmple(cmp_I, I);
490 b8x16 fm = vec_and(ltI, ltE);
491
492 fm = vec_and(fm, (b8x16)apply);
493 if (vec_all_eq(fm, zero))
494 return;
495
496 UNPACK_16(p2)
497 UNPACK_16(p1)
498 UNPACK_16(p0)
499 UNPACK_16(q0)
500 UNPACK_16(q1)
501 UNPACK_16(q2)
502
503 m6 = vec_and(m6, (b32x4)fm);
504
505 u8x16 cmp_flat8in = vec_max(max_a_p2p0_q2q0, max_a_p1p0_q1q0);
506 b8x16 apply_6 = vec_and(vec_cmple(cmp_flat8in, F), (b8x16)m6);
507
508 b8x16 apply_4 = vec_andc(fm, apply_6);
509
510 if (vec_any_ne(apply_4, zero)) {
511 APPLY_4
512 }
513
514 if (vec_any_ne(apply_6, zero)) {
515 DECLARE_ADD_16HL(p2p2, p2, p2)
516 DECLARE_ADD_16HL(p2p1, p2, p1)
517 DECLARE_ADD_16HL(p1p0, p1, p0)
518 DECLARE_ADD_16HL(p0q0, p0, q0)
519 DECLARE_ADD_16HL(q0q1, q0, q1)
520 DECLARE_ADD_16HL(q1q2, q1, q2)
521 DECLARE_ADD_16HL(p2p2p0q0, p2p2, p0q0)
522 DECLARE_ADD_16HL(p2p1p1p0, p2p1, p1p0)
523 DECLARE_ADD_16HL(p1p0q1q2, p1p0, q1q2)
524 DECLARE_ADD_16HL(p0q0q0q1, p0q0, q0q1)
525 u16x8 q1q2q2q2h = q2h * 3 + q1h;
526 u16x8 q1q2q2q2l = q2l * 3 + q1l;
527
528 DECLARE_ADD_16HL(op1, p2p2p0q0, p2p1p1p0)
529 DECLARE_ADD_16HL(op0, p2p1p1p0, p0q0q0q1)
530 DECLARE_ADD_16HL(oq0, p1p0q1q2, p0q0q0q1)
531 DECLARE_ADD_16HL(oq1, p0q0q0q1, q1q2q2q2)
532
533 ADD_AND_SHIFT4(op1)
534 ADD_AND_SHIFT4(op0)
535 ADD_AND_SHIFT4(oq0)
536 ADD_AND_SHIFT4(oq1)
537
538 p1 = PACK_AND_SEL(p1, apply_6);
539 p0 = PACK_AND_SEL(p0, apply_6);
540 q0 = PACK_AND_SEL(q0, apply_6);
541 q1 = PACK_AND_SEL(q1, apply_6);
542 }
543
544 u8x16 p1p0ab = (u8x16)vec_mergeh(p1, p0); // p1 p0 ...
545 u8x16 q0q1ab = (u8x16)vec_mergeh(q0, q1); // q0 q1 ...
546 u8x16 p1p0cd = (u8x16)vec_mergel(p1, p0); // p1 p0 ...
547 u8x16 q0q1cd = (u8x16)vec_mergel(q0, q1); // q0 q1 ...
548
549 u8x16 outa = (u8x16)vec_mergeh((u16x8)p1p0ab, (u16x8)q0q1ab); // op1 op0 oq0 oq1 ...
550 u8x16 outb = (u8x16)vec_mergel((u16x8)p1p0ab, (u16x8)q0q1ab);
551 u8x16 outc = (u8x16)vec_mergeh((u16x8)p1p0cd, (u16x8)q0q1cd);
552 u8x16 outd = (u8x16)vec_mergel((u16x8)p1p0cd, (u16x8)q0q1cd);
553
554 if (apply[0]) {
555 store_h_4(outa, dst2, stridea);
556 }
557 dst2 += 4 * stridea;
558 if (apply[1]) {
559 store_h_4(outb, dst2, stridea);
560 }
561 dst2 += 4 * stridea;
562 if (apply[2]) {
563 store_h_4(outc, dst2, stridea);
564 }
565 dst2 += 4 * stridea;
566 if (apply[3]) {
567 store_h_4(outd, dst2, stridea);
568 }
569 }
570
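// wd=8 variant: p3..q3 are transposed in, each row is classified by the
// flat8 test into apply_8 (8-pixel averaging filter) or apply_4 (narrow
// filter), and p2..q2 are written back via store_h_8.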
571 static inline void
572 loop_filter_h_8_all(uint8_t *dst, u8x16 E, u8x16 I, u8x16 H,
573 const ptrdiff_t stridea, b32x4 apply, b32x4 m8)
574 {
575 uint8_t *dst2 = dst - 3;
576 dst -= 4;
577 u8x16 p3, p2, p1, p0, q0, q1, q2, q3;
578
579 LOAD4_H(a)
580 dst += stridea;
581 LOAD4_H(b)
582 dst += stridea;
583 LOAD4_H(c)
584 dst += stridea;
585 LOAD4_H(d)
586
587 MERGEH_4(a)
588 MERGEH_4(b)
589 MERGEH_4(c)
590 MERGEH_4(d)
591
592 MERGE_U16_0123(a)
593 MERGE_U16_0123(b)
594 MERGE_U16_0123(c)
595 MERGE_U16_0123(d)
596
597 MERGEH_U32_LINE(h)
598 MERGEL_U32_LINE(h)
599 MERGEH_U32_LINE(l)
600 MERGEL_U32_LINE(l)
601
602 MERGE_U32(p3, p2, h, h)
603 MERGE_U32(p1, p0, l, h)
604 MERGE_U32(q0, q1, h, l)
605 MERGE_U32(q2, q3, l, l)
606
607 const u8x16 F = vec_splat_u8(1);
608
609 const u8x16 zero = vec_splat_u8(0);
610 const u16x8 v3u16 = vec_splat_u16(3);
611 const u16x8 v4u16 = vec_splat_u16(4);
612 const u8x16 v1u8 = vec_splat_u8(1);
613 const b8x16 s = (b8x16)vec_splats((uint8_t)128);
614
615 const u8x16 a_p1_p0 = vec_absd(p1, p0);
616 const u8x16 a_q1_q0 = vec_absd(q1, q0);
617 const u8x16 a_p0_q0 = vec_absd(p0, q0);
618 const u8x16 a_p1_q1 = vec_absd(p1, q1);
619 const u8x16 a_p2_p1 = vec_absd(p2, p1);
620 const u8x16 a_q2_q1 = vec_absd(q2, q1);
621 const u8x16 a_p2_p0 = vec_absd(p2, p0);
622 const u8x16 a_q2_q0 = vec_absd(q2, q0);
623 const u8x16 a_p3_p0 = vec_absd(p3, p0);
624 const u8x16 a_q3_q0 = vec_absd(q3, q0);
625 const u8x16 a_p3_p2 = vec_absd(p3, p2);
626 const u8x16 a_q3_q2 = vec_absd(q3, q2);
627
628 u8x16 max_a_p2p1_q2q1 = vec_max(a_p2_p1, a_q2_q1);
629 u8x16 max_a_p3p2_q3q2 = vec_max(a_p3_p2, a_q3_q2);
630 u8x16 cmp_E = vec_adds(a_p0_q0, a_p0_q0);
631 const u8x16 max_a_p1p0_q1q0 = vec_max(a_p1_p0, a_q1_q0);
632 const u8x16 max_a_p2p0_q2q0 = vec_max(a_p2_p0, a_q2_q0);
633 u8x16 max_a_p3p0_q3q0 = vec_max(a_p3_p0, a_q3_q0);
634 u8x16 cmp_I_m8 = vec_max(max_a_p2p1_q2q1, max_a_p3p2_q3q2);
635 u8x16 cmp_I_m4 = max_a_p1p0_q1q0;
636 cmp_E = vec_adds(vec_sr(a_p1_q1, v1u8), cmp_E);
637 cmp_I_m8 = vec_and(cmp_I_m8, (u8x16)m8);
638 u8x16 cmp_I = vec_max(cmp_I_m4, cmp_I_m8);
639 const b8x16 ltE = vec_cmple(cmp_E, E);
640 const b8x16 ltI = vec_cmple(cmp_I, I);
641 b8x16 fm = vec_and(ltI, ltE);
642
643 fm = vec_and(fm, (b8x16)apply);
644 if (vec_all_eq(fm, zero))
645 return;
646
651 UNPACK_16(p3)
652 UNPACK_16(p2)
653 UNPACK_16(p1)
654 UNPACK_16(p0)
655 UNPACK_16(q0)
656 UNPACK_16(q1)
657 UNPACK_16(q2)
658 UNPACK_16(q3)
659
660 m8 = vec_and(m8, (b32x4)fm);
661
662 u8x16 cmp_flat8in = vec_max(max_a_p2p0_q2q0, max_a_p1p0_q1q0);
663 cmp_flat8in = vec_max(max_a_p3p0_q3q0, cmp_flat8in);
664 b8x16 apply_8 = vec_and(vec_cmple(cmp_flat8in, F), (b8x16)m8);
665
666 b8x16 apply_4 = vec_andc(fm, apply_8);
667
668 if (vec_any_ne(apply_4, zero)) {
669 APPLY_4
670 }
671
672 if (vec_any_ne(apply_8, zero)) {
673 APPLY_8
674 }
675
676 MERGEHL_U8(p2, p1) // A0 A1 A2 A3 B0 B1 B2 B3
677 MERGEHL_U8(p0, q0)
678 MERGEHL_U8(q1, q2)
679
680 MERGEHL_U16(ab_p2p1p0q0, p2p1h, p0q0h) // A0 p2 p1 p0 q0 | A1 p2 p1 p0 q0 | A2 ...
681 // B0 ...
682 MERGEHL_U16(cd_p2p1p0q0, p2p1l, p0q0l) // C0 ...
683 // D0 ...
684 MERGEHL_U16(ab_q1q2, q1q2h, q1q2h) // A0 q1 q2 q1 q2 | A1 q1 q2 ...
685 // B0 ...
686 MERGEHL_U16(cd_q1q2, q1q2l, q1q2l) // C0 ...
687 // D0 ...
688
689 MERGEHL_U32(a, ab_p2p1p0q0h, ab_q1q2h) // A0 p2 p1 p0 q0 q1 q2 q1 q2 | A1 ..
690 // A2 ... | A3 ...
691 MERGEHL_U32(b, ab_p2p1p0q0l, ab_q1q2l) // B0 ...
692 // B2 ...
693 MERGEHL_U32(c, cd_p2p1p0q0h, cd_q1q2h) // C0 ...
694 // C2
695 MERGEHL_U32(d, cd_p2p1p0q0l, cd_q1q2l) // D0 ..
696 // D2 ..
697 if (apply[0]) {
698 store_h_8(ah, al, dst2, stridea);
699 }
700 dst2 += 4 * stridea;
701 if (apply[1]) {
702 store_h_8(bh, bl, dst2, stridea);
703 }
704 dst2 += 4 * stridea;
705
706 if (apply[2]) {
707 store_h_8(ch, cl, dst2, stridea);
708 }
709 dst2 += 4 * stridea;
710 if (apply[3]) {
711 store_h_8(dh, dl, dst2, stridea);
712 }
713
714 }
715
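// wd=16 variant: 14 pixels per row (p6..q6) are needed, so the 16-byte rows
// are transposed in two halves. Each row is classified into apply_16,
// apply_8 or apply_4, the matching filter is applied, and the 12 modified
// pixels (p5..q5) are written back with vec_xst_len.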
716 static inline void
717 loop_filter_h_16_all(uint8_t *dst, u8x16 E, u8x16 I, u8x16 H,
718 const ptrdiff_t stridea, b32x4 apply, b32x4 m8, b32x4 m16)
719 {
720 uint8_t *dst2 = dst - 6;
721 dst -= 7;
722 u8x16 p3, p2, p1, p0, q0, q1, q2, q3;
723 u8x16 p6, p5, p4, q4, q5, q6;
724
725 LOAD4_H(a)
726 dst += stridea;
727 LOAD4_H(b)
728 dst += stridea;
729 LOAD4_H(c)
730 dst += stridea;
731 LOAD4_H(d)
732
733 {
734 MERGEH_4(a)
735 MERGEH_4(b)
736 MERGEH_4(c)
737 MERGEH_4(d)
738
739 MERGE_U16_0123(a)
740 MERGE_U16_0123(b)
741 MERGE_U16_0123(c)
742 MERGE_U16_0123(d)
743
744 MERGEH_U32_LINE(h)
745 MERGEL_U32_LINE(h)
746 MERGEH_U32_LINE(l)
747 MERGEL_U32_LINE(l)
748
749 MERGE_U32(p6, p5, h, h)
750 MERGE_U32(p4, p3, l, h)
751 MERGE_U32(p2, p1, h, l)
752 MERGE_U32(p0, q0, l, l)
753 }
754 {
755 MERGEL_4(a)
756 MERGEL_4(b)
757 MERGEL_4(c)
758 MERGEL_4(d)
759
760 MERGE_U16_0123(a)
761 MERGE_U16_0123(b)
762 MERGE_U16_0123(c)
763 MERGE_U16_0123(d)
764
765 MERGEH_U32_LINE(h)
766 MERGEL_U32_LINE(h)
767 MERGEH_U32_LINE(l)
768
769 MERGE_U32(q1, q2, h, h)
770 MERGE_U32(q3, q4, l, h)
771 MERGE_U32(q5, q6, h, l)
772 }
773
774 const u8x16 F = vec_splat_u8(1);
775
776 const u8x16 zero = vec_splat_u8(0);
777 const u16x8 v3u16 = vec_splat_u16(3);
778 const u16x8 v4u16 = vec_splat_u16(4);
779 const u16x8 v8u16 = vec_splat_u16(8);
780 const u8x16 v1u8 = vec_splat_u8(1);
781 const b8x16 s = (b8x16)vec_splats((uint8_t)128);
782
783 const u8x16 a_p6_p0 = vec_absd(p6, p0);
784 const u8x16 a_p5_p0 = vec_absd(p5, p0);
785 const u8x16 a_p4_p0 = vec_absd(p4, p0);
786 const u8x16 a_q4_q0 = vec_absd(q4, q0);
787 const u8x16 a_q5_q0 = vec_absd(q5, q0);
788 const u8x16 a_q6_q0 = vec_absd(q6, q0);
789
790 const u8x16 a_p1_p0 = vec_absd(p1, p0);
791 const u8x16 a_q1_q0 = vec_absd(q1, q0);
792 const u8x16 a_p0_q0 = vec_absd(p0, q0);
793 const u8x16 a_p1_q1 = vec_absd(p1, q1);
794 const u8x16 a_p2_p1 = vec_absd(p2, p1);
795 const u8x16 a_q2_q1 = vec_absd(q2, q1);
796 const u8x16 a_p2_p0 = vec_absd(p2, p0);
797 const u8x16 a_q2_q0 = vec_absd(q2, q0);
798 const u8x16 a_p3_p0 = vec_absd(p3, p0);
799 const u8x16 a_q3_q0 = vec_absd(q3, q0);
800 const u8x16 a_p3_p2 = vec_absd(p3, p2);
801 const u8x16 a_q3_q2 = vec_absd(q3, q2);
802
803 u8x16 max_a_p2p1_q2q1 = vec_max(a_p2_p1, a_q2_q1);
804 u8x16 max_a_p3p2_q3q2 = vec_max(a_p3_p2, a_q3_q2);
805 u8x16 cmp_E = vec_adds(a_p0_q0, a_p0_q0);
806 const u8x16 max_a_p1p0_q1q0 = vec_max(a_p1_p0, a_q1_q0);
807 const u8x16 max_a_p2p0_q2q0 = vec_max(a_p2_p0, a_q2_q0);
808
809 const u8x16 max_a_p4p0_q4q0 = vec_max(a_p4_p0, a_q4_q0);
810 const u8x16 max_a_p5p0_q5q0 = vec_max(a_p5_p0, a_q5_q0);
811 const u8x16 max_a_p6p0_q6q0 = vec_max(a_p6_p0, a_q6_q0);
812
813 b32x4 m8_16 = vec_or(m8, m16);
814
815 u8x16 max_a_p3p0_q3q0 = vec_max(a_p3_p0, a_q3_q0);
816 u8x16 cmp_I_m8 = vec_max(max_a_p2p1_q2q1, max_a_p3p2_q3q2);
817 u8x16 cmp_I_m4 = max_a_p1p0_q1q0;
818 cmp_E = vec_adds(vec_sr(a_p1_q1, v1u8), cmp_E);
819 cmp_I_m8 = vec_and(cmp_I_m8, (b8x16)m8_16);
820 u8x16 cmp_I = vec_max(cmp_I_m4, cmp_I_m8);
821 const b8x16 ltE = vec_cmple(cmp_E, E);
822 const b8x16 ltI = vec_cmple(cmp_I, I);
823 b8x16 fm = vec_and(ltI, ltE);
824
825 fm = vec_and(fm, (b8x16)apply);
826 if (vec_all_eq(fm, zero))
827 return;
828
829 u8x16 cmp_flat8in = vec_max(max_a_p2p0_q2q0, max_a_p1p0_q1q0);
830 u8x16 cmp_flat8out = vec_max(max_a_p6p0_q6q0, max_a_p5p0_q5q0);
831
832 m8_16 = vec_and(m8_16, (b32x4)fm);
833 m16 = vec_and(m16, (b32x4)fm);
834
835 cmp_flat8in = vec_max(max_a_p3p0_q3q0, cmp_flat8in);
836 cmp_flat8out = vec_max(max_a_p4p0_q4q0, cmp_flat8out);
837 b8x16 flat8in = vec_cmple(cmp_flat8in, F);
838 b8x16 flat8out = vec_cmple(cmp_flat8out, F);
839 flat8in = vec_and(flat8in, (b8x16)m8_16);
840 flat8out = vec_and(flat8out, (b8x16)m16);
841
842 b8x16 apply_16 = vec_and(flat8out, flat8in);
843 b8x16 apply_8 = vec_andc(flat8in, flat8out);
844
845 UNPACK_16(p6)
846 UNPACK_16(p5)
847 UNPACK_16(p4)
848 UNPACK_16(p3)
849 UNPACK_16(p2)
850 UNPACK_16(p1)
851 UNPACK_16(p0)
852
853 b8x16 apply_4 = vec_and(fm, vec_nor(apply_16, apply_8));
854
855 UNPACK_16(q0)
856 UNPACK_16(q1)
857 UNPACK_16(q2)
858 UNPACK_16(q3)
859 UNPACK_16(q4)
860 UNPACK_16(q5)
861 UNPACK_16(q6)
862
863 if (vec_any_ne(apply_4, zero)) {
864 APPLY_4
865 }
866
867 if (vec_any_ne(apply_16, zero)) {
868 APPLY_16
869 }
870
871 if (vec_any_ne(apply_8, zero)) {
872 APPLY_8
873 }
874
875 MERGEHL_U8(p5, p4)
876 MERGEHL_U8(p3, p2)
877 MERGEHL_U8(p1, p0)
878 MERGEHL_U8(q0, q1)
879 MERGEHL_U8(q2, q3)
880 MERGEHL_U8(q4, q5)
881
882 MERGEHL_U16(ab_p5p4p3p2, p5p4h, p3p2h)
883 MERGEHL_U16(cd_p5p4p3p2, p5p4l, p3p2l)
884 MERGEHL_U16(ab_p1p0q0q1, p1p0h, q0q1h)
885 MERGEHL_U16(cd_p1p0q0q1, p1p0l, q0q1l)
886 MERGEHL_U16(ab_q2q3q4q5, q2q3h, q4q5h)
887 MERGEHL_U16(cd_q2q3q4q5, q2q3l, q4q5l)
888
889
890 MERGEHL_U32(a_p5p4p3p2q2q3q4q5, ab_p5p4p3p2h, ab_q2q3q4q5h) // A0 p5p4p3p2 q2q3q4q5 A1
891 // A2 A3
892 MERGEHL_U32(a_p1p0q0q1q2q3q4q5, ab_p1p0q0q1h, ab_q2q3q4q5h) // A0 p1p0q0q1 q2q3q4q5 A1
893 // A2 A3
894 MERGEHL_U32(b_p5p4p3p2q2q3q4q5, ab_p5p4p3p2l, ab_q2q3q4q5l) // B0 p5p4p3p2 q2q3q4q5 B1
895 // B2 B3
896 MERGEHL_U32(b_p1p0q0q1q2q3q4q5, ab_p1p0q0q1l, ab_q2q3q4q5l) // B0 p1p0q0q1 q2q3q4q5 B1
897 // B2 B3
898 MERGEHL_U32(c_p5p4p3p2q2q3q4q5, cd_p5p4p3p2h, cd_q2q3q4q5h) // C0 p5p4p3p2 q2q3q4q5 C1
899 // C2 C3
900 MERGEHL_U32(c_p1p0q0q1q2q3q4q5, cd_p1p0q0q1h, cd_q2q3q4q5h) // C0 p1p0q0q1 q2q3q4q5 C1
901 // C2 C3
902 MERGEHL_U32(d_p5p4p3p2q2q3q4q5, cd_p5p4p3p2l, cd_q2q3q4q5l) // D0 p5p4p3p2 q2q3q4q5 D1
903 // D2 D3
904 MERGEHL_U32(d_p1p0q0q1q2q3q4q5, cd_p1p0q0q1l, cd_q2q3q4q5l) // D0 p1p0q0q1 q2q3q4q5 D1
905 // D2 D3
906
907 MERGEHL_U32(a01, a_p5p4p3p2q2q3q4q5h, a_p1p0q0q1q2q3q4q5h) // A0 p5p4p3p2 p1p0q0q1 q2q3q4q5 q2q3q4q5
908 // A1
909 vec_xst_len(a01h, dst2, 12);
910 dst2 += stridea;
911 vec_xst_len(a01l, dst2, 12);
912 dst2 += stridea;
913 MERGEHL_U32(a23, a_p5p4p3p2q2q3q4q5l, a_p1p0q0q1q2q3q4q5l) // A2
914 // A3
915 vec_xst_len(a23h, dst2, 12);
916 dst2 += stridea;
917 vec_xst_len(a23l, dst2, 12);
918 dst2 += stridea;
919 MERGEHL_U32(b01, b_p5p4p3p2q2q3q4q5h, b_p1p0q0q1q2q3q4q5h) // B0 p5p4p3p2 p1p0q0q1 q2q3q4q5 q2q3q4q5
920 // B1
921 vec_xst_len(b01h, dst2, 12);
922 dst2 += stridea;
923 vec_xst_len(b01l, dst2, 12);
924 dst2 += stridea;
925 MERGEHL_U32(b23, b_p5p4p3p2q2q3q4q5l, b_p1p0q0q1q2q3q4q5l) // B2
926 // B3
927 vec_xst_len(b23h, dst2, 12);
928 dst2 += stridea;
929 vec_xst_len(b23l, dst2, 12);
930 dst2 += stridea;
931 MERGEHL_U32(c01, c_p5p4p3p2q2q3q4q5h, c_p1p0q0q1q2q3q4q5h) // C0 p5p4p3p2 p1p0q0q1 q2q3q4q5 q2q3q4q5
932 // C1
933 vec_xst_len(c01h, dst2, 12);
934 dst2 += stridea;
935 vec_xst_len(c01l, dst2, 12);
936 dst2 += stridea;
937 MERGEHL_U32(c23, c_p5p4p3p2q2q3q4q5l, c_p1p0q0q1q2q3q4q5l) // C2
938 // C3
939 vec_xst_len(c23h, dst2, 12);
940 dst2 += stridea;
941 vec_xst_len(c23l, dst2, 12);
942 dst2 += stridea;
943 MERGEHL_U32(d01, d_p5p4p3p2q2q3q4q5h, d_p1p0q0q1q2q3q4q5h) // D0 p5p4p3p2 p1p0q0q1 q2q3q4q5 q2q3q4q5
944 // D1
945 vec_xst_len(d01h, dst2, 12);
946 dst2 += stridea;
947 vec_xst_len(d01l, dst2, 12);
948 dst2 += stridea;
949 MERGEHL_U32(d23, d_p5p4p3p2q2q3q4q5l, d_p1p0q0q1q2q3q4q5l) // D2
950 // D3
951 vec_xst_len(d23h, dst2, 12);
952 dst2 += stridea;
953 vec_xst_len(d23l, dst2, 12);
954 dst2 += stridea;
955 }
956
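// v_* variants: the p/q samples of an edge each occupy a full row, so every
// line is a plain 16-byte load/store and no transpose is needed; the
// threshold and mask logic mirrors the h_* functions above.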
957 static inline void
958 loop_filter_v_4_all(uint8_t *dst, u8x16 E, u8x16 I, u8x16 H,
959 const ptrdiff_t strideb, b32x4 apply)
960 {
961 uint8_t *p1d = dst + strideb * -2;
962 uint8_t *p0d = dst + strideb * -1;
963 uint8_t *q0d = dst + strideb * +0;
964 uint8_t *q1d = dst + strideb * +1;
965
966 u8x16 p1 = vec_xl(0, p1d);
967 u8x16 p0 = vec_xl(0, p0d);
968 u8x16 q0 = vec_xl(0, q0d);
969 u8x16 q1 = vec_xl(0, q1d);
970
971 const u8x16 zero = vec_splat_u8(0);
972 const u8x16 v1u8 = vec_splat_u8(1);
973 const b8x16 s = (b8x16)vec_splats((uint8_t)128);
974
975 const u8x16 a_p1_p0 = vec_absd(p1, p0);
976 const u8x16 a_q1_q0 = vec_absd(q1, q0);
977 const u8x16 a_p0_q0 = vec_absd(p0, q0);
978 const u8x16 a_p1_q1 = vec_absd(p1, q1);
979
980 u8x16 cmp_E = vec_adds(a_p0_q0, a_p0_q0);
981 const u8x16 max_a_p1p0_q1q0 = vec_max(a_p1_p0, a_q1_q0);
982 const u8x16 cmp_I = max_a_p1p0_q1q0;
983 cmp_E = vec_adds(vec_sr(a_p1_q1, v1u8), cmp_E);
984 const b8x16 ltI = vec_cmple(cmp_I, I);
985 const b8x16 ltE = vec_cmple(cmp_E, E);
986 b8x16 fm = vec_and(ltI, ltE);
987
988 fm = vec_and(fm, (b8x16)apply);
989 if (vec_all_eq(fm, zero))
990 return;
991
992 UNPACK_16(p0)
993 UNPACK_16(q0)
994
995 APPLY_4
996
997 vec_xst(p0, 0, p0d);
998 vec_xst(q0, 0, q0d);
999 vec_xst(q1, 0, q1d);
1000 vec_xst(p1, 0, p1d);
1001 }
1002
1003 static inline void
1004 loop_filter_v_6_all(uint8_t *dst, u8x16 E, u8x16 I, u8x16 H,
1005 const ptrdiff_t strideb, b32x4 apply, b32x4 m6)
1006 {
1007 uint8_t *p2d = dst + strideb * -3;
1008 uint8_t *p1d = dst + strideb * -2;
1009 uint8_t *p0d = dst + strideb * -1;
1010 uint8_t *q0d = dst + strideb * +0;
1011 uint8_t *q1d = dst + strideb * +1;
1012 uint8_t *q2d = dst + strideb * +2;
1013
1014 u8x16 p2 = vec_xl(0, p2d);
1015 u8x16 p1 = vec_xl(0, p1d);
1016 u8x16 p0 = vec_xl(0, p0d);
1017 u8x16 q0 = vec_xl(0, q0d);
1018 u8x16 q1 = vec_xl(0, q1d);
1019 u8x16 q2 = vec_xl(0, q2d);
1020
1021 const u8x16 F = vec_splat_u8(1);
1022
1023 const u8x16 zero = vec_splat_u8(0);
1024 const u16x8 v3u16 = vec_splat_u16(3);
1025 const u16x8 v4u16 = vec_splat_u16(4);
1026 const u8x16 v1u8 = vec_splat_u8(1);
1027 const b8x16 s = (b8x16)vec_splats((uint8_t)128);
1028
1029 const u8x16 a_p1_p0 = vec_absd(p1, p0);
1030 const u8x16 a_q1_q0 = vec_absd(q1, q0);
1031 const u8x16 a_p0_q0 = vec_absd(p0, q0);
1032 const u8x16 a_p1_q1 = vec_absd(p1, q1);
1033 const u8x16 a_p2_p1 = vec_absd(p2, p1);
1034 const u8x16 a_q2_q1 = vec_absd(q2, q1);
1035 const u8x16 a_p2_p0 = vec_absd(p2, p0);
1036 const u8x16 a_q2_q0 = vec_absd(q2, q0);
1037
1038 u8x16 max_a_p2p1_q2q1 = vec_max(a_p2_p1, a_q2_q1);
1039 u8x16 cmp_E = vec_adds(a_p0_q0, a_p0_q0);
1040 const u8x16 max_a_p1p0_q1q0 = vec_max(a_p1_p0, a_q1_q0);
1041 u8x16 max_a_p2p0_q2q0 = vec_max(a_p2_p0, a_q2_q0);
1042 u8x16 cmp_I_m6 = max_a_p2p1_q2q1;
1043 u8x16 cmp_I_m4 = max_a_p1p0_q1q0;
1044 cmp_E = vec_adds(vec_sr(a_p1_q1, v1u8), cmp_E);
1045 cmp_I_m6 = vec_and(cmp_I_m6, (u8x16)m6);
1046 u8x16 cmp_I = vec_max(cmp_I_m4, cmp_I_m6);
1047 const b8x16 ltE = vec_cmple(cmp_E, E);
1048 const b8x16 ltI = vec_cmple(cmp_I, I);
1049 b8x16 fm = vec_and(ltI, ltE);
1050
1051 fm = vec_and(fm, (b8x16)apply);
1052 if (vec_all_eq(fm, zero))
1053 return;
1054
1055 UNPACK_16(p2)
1056 UNPACK_16(p1)
1057 UNPACK_16(p0)
1058 UNPACK_16(q0)
1059 UNPACK_16(q1)
1060 UNPACK_16(q2)
1061
1062 m6 = vec_and(m6, (b32x4)fm);
1063
1064 u8x16 cmp_flat8in = vec_max(max_a_p2p0_q2q0, max_a_p1p0_q1q0);
1065 b8x16 apply_6 = vec_and(vec_cmple(cmp_flat8in, F), (b8x16)m6);
1066
1067 b8x16 apply_4 = vec_andc(fm, apply_6);
1068
1069 if (vec_any_ne(apply_4, zero)) {
1070 APPLY_4
1071 }
1072
1073 if (vec_any_ne(apply_6, zero)) {
1074 DECLARE_ADD_16HL(p2p2, p2, p2)
1075 DECLARE_ADD_16HL(p2p1, p2, p1)
1076 DECLARE_ADD_16HL(p1p0, p1, p0)
1077 DECLARE_ADD_16HL(p0q0, p0, q0)
1078 DECLARE_ADD_16HL(q0q1, q0, q1)
1079 DECLARE_ADD_16HL(q1q2, q1, q2)
1080 DECLARE_ADD_16HL(p2p2p0q0, p2p2, p0q0)
1081 DECLARE_ADD_16HL(p2p1p1p0, p2p1, p1p0)
1082 DECLARE_ADD_16HL(p1p0q1q2, p1p0, q1q2)
1083 DECLARE_ADD_16HL(p0q0q0q1, p0q0, q0q1)
1084 u16x8 q1q2q2q2h = q2h * 3 + q1h;
1085 u16x8 q1q2q2q2l = q2l * 3 + q1l;
1086
1087 DECLARE_ADD_16HL(op1, p2p2p0q0, p2p1p1p0)
1088 DECLARE_ADD_16HL(op0, p2p1p1p0, p0q0q0q1)
1089 DECLARE_ADD_16HL(oq0, p1p0q1q2, p0q0q0q1)
1090 DECLARE_ADD_16HL(oq1, p0q0q0q1, q1q2q2q2)
1091
1092 ADD_AND_SHIFT4(op1)
1093 ADD_AND_SHIFT4(op0)
1094 ADD_AND_SHIFT4(oq0)
1095 ADD_AND_SHIFT4(oq1)
1096
1097 p1 = PACK_AND_SEL(p1, apply_6);
1098 p0 = PACK_AND_SEL(p0, apply_6);
1099 q0 = PACK_AND_SEL(q0, apply_6);
1100 q1 = PACK_AND_SEL(q1, apply_6);
1101 }
1102
1103 vec_xst(p0, 0, p0d);
1104 vec_xst(q0, 0, q0d);
1105 vec_xst(q1, 0, q1d);
1106 vec_xst(p1, 0, p1d);
1107 }
1108
1109 static inline void
1110 loop_filter_v_8_all(uint8_t *dst, u8x16 E, u8x16 I, u8x16 H,
1111 const ptrdiff_t strideb, b32x4 apply, b32x4 m8)
1112 {
1113
1114 uint8_t *p3d = dst + strideb * -4;
1115 uint8_t *p2d = dst + strideb * -3;
1116 uint8_t *p1d = dst + strideb * -2;
1117 uint8_t *p0d = dst + strideb * -1;
1118 uint8_t *q0d = dst + strideb * +0;
1119 uint8_t *q1d = dst + strideb * +1;
1120 uint8_t *q2d = dst + strideb * +2;
1121 uint8_t *q3d = dst + strideb * +3;
1122
1123 u8x16 p3 = vec_xl(0, p3d);
1124 u8x16 p2 = vec_xl(0, p2d);
1125 u8x16 p1 = vec_xl(0, p1d);
1126 u8x16 p0 = vec_xl(0, p0d);
1127 u8x16 q0 = vec_xl(0, q0d);
1128 u8x16 q1 = vec_xl(0, q1d);
1129 u8x16 q2 = vec_xl(0, q2d);
1130 u8x16 q3 = vec_xl(0, q3d);
1131
1132 const u8x16 F = vec_splat_u8(1);
1133
1134 const u8x16 zero = vec_splat_u8(0);
1135 const u16x8 v3u16 = vec_splat_u16(3);
1136 const u16x8 v4u16 = vec_splat_u16(4);
1137 const u8x16 v1u8 = vec_splat_u8(1);
1138 const b8x16 s = (b8x16)vec_splats((uint8_t)128);
1139
1140 const u8x16 a_p1_p0 = vec_absd(p1, p0);
1141 const u8x16 a_q1_q0 = vec_absd(q1, q0);
1142 const u8x16 a_p0_q0 = vec_absd(p0, q0);
1143 const u8x16 a_p1_q1 = vec_absd(p1, q1);
1144 const u8x16 a_p2_p1 = vec_absd(p2, p1);
1145 const u8x16 a_q2_q1 = vec_absd(q2, q1);
1146 const u8x16 a_p2_p0 = vec_absd(p2, p0);
1147 const u8x16 a_q2_q0 = vec_absd(q2, q0);
1148 const u8x16 a_p3_p0 = vec_absd(p3, p0);
1149 const u8x16 a_q3_q0 = vec_absd(q3, q0);
1150 const u8x16 a_p3_p2 = vec_absd(p3, p2);
1151 const u8x16 a_q3_q2 = vec_absd(q3, q2);
1152
1153 u8x16 max_a_p2p1_q2q1 = vec_max(a_p2_p1, a_q2_q1);
1154 u8x16 max_a_p3p2_q3q2 = vec_max(a_p3_p2, a_q3_q2);
1155 u8x16 cmp_E = vec_adds(a_p0_q0, a_p0_q0);
1156 const u8x16 max_a_p1p0_q1q0 = vec_max(a_p1_p0, a_q1_q0);
1157 const u8x16 max_a_p2p0_q2q0 = vec_max(a_p2_p0, a_q2_q0);
1158 u8x16 max_a_p3p0_q3q0 = vec_max(a_p3_p0, a_q3_q0);
1159 u8x16 cmp_I_m8 = vec_max(max_a_p2p1_q2q1, max_a_p3p2_q3q2);
1160 u8x16 cmp_I_m4 = max_a_p1p0_q1q0;
1161 cmp_E = vec_adds(vec_sr(a_p1_q1, v1u8), cmp_E);
1162 cmp_I_m8 = vec_and(cmp_I_m8, (u8x16)m8);
1163 u8x16 cmp_I = vec_max(cmp_I_m4, cmp_I_m8);
1164 const b8x16 ltE = vec_cmple(cmp_E, E);
1165 const b8x16 ltI = vec_cmple(cmp_I, I);
1166 b8x16 fm = vec_and(ltI, ltE);
1167
1168 fm = vec_and(fm, (b8x16)apply);
1169 if (vec_all_eq(fm, zero))
1170 return;
1171
1176 UNPACK_16(p3)
1177 UNPACK_16(p2)
1178 UNPACK_16(p1)
1179 UNPACK_16(p0)
1180 UNPACK_16(q0)
1181 UNPACK_16(q1)
1182 UNPACK_16(q2)
1183 UNPACK_16(q3)
1184
1185 m8 = vec_and(m8, (b32x4)fm);
1186
1187 u8x16 cmp_flat8in = vec_max(max_a_p2p0_q2q0, max_a_p1p0_q1q0);
1188 cmp_flat8in = vec_max(max_a_p3p0_q3q0, cmp_flat8in);
1189 b8x16 apply_8 = vec_and(vec_cmple(cmp_flat8in, F), (b8x16)m8);
1190
1191 b8x16 apply_4 = vec_andc(fm, apply_8);
1192
1193 if (vec_any_ne(apply_4, zero)) {
1194 APPLY_4
1195 }
1196
1197 if (vec_any_ne(apply_8, zero)) {
1198 APPLY_8
1199 }
1200
1201 vec_xst(p0, 0, p0d);
1202 vec_xst(q0, 0, q0d);
1203 vec_xst(q1, 0, q1d);
1204 vec_xst(p1, 0, p1d);
1205 vec_xst(q2, 0, q2d);
1206 vec_xst(p2, 0, p2d);
1207 }
1208
1209 static inline void
1210 loop_filter_v_16_all(uint8_t *dst, u8x16 E, u8x16 I, u8x16 H,
1211 const ptrdiff_t strideb, b32x4 apply, b32x4 m8, b32x4 m16)
1212 {
1213
1214 uint8_t *p6d = dst + strideb * -7;
1215 uint8_t *p5d = dst + strideb * -6;
1216 uint8_t *p4d = dst + strideb * -5;
1217 uint8_t *p3d = dst + strideb * -4;
1218 uint8_t *p2d = dst + strideb * -3;
1219 uint8_t *p1d = dst + strideb * -2;
1220 uint8_t *p0d = dst + strideb * -1;
1221 uint8_t *q0d = dst + strideb * +0;
1222 uint8_t *q1d = dst + strideb * +1;
1223 uint8_t *q2d = dst + strideb * +2;
1224 uint8_t *q3d = dst + strideb * +3;
1225 uint8_t *q4d = dst + strideb * +4;
1226 uint8_t *q5d = dst + strideb * +5;
1227 uint8_t *q6d = dst + strideb * +6;
1228
1229 u8x16 p6 = vec_xl(0, p6d);
1230 u8x16 p5 = vec_xl(0, p5d);
1231 u8x16 p4 = vec_xl(0, p4d);
1232 u8x16 p3 = vec_xl(0, p3d);
1233 u8x16 p2 = vec_xl(0, p2d);
1234 u8x16 p1 = vec_xl(0, p1d);
1235 u8x16 p0 = vec_xl(0, p0d);
1236 u8x16 q0 = vec_xl(0, q0d);
1237 u8x16 q1 = vec_xl(0, q1d);
1238 u8x16 q2 = vec_xl(0, q2d);
1239 u8x16 q3 = vec_xl(0, q3d);
1240 u8x16 q4 = vec_xl(0, q4d);
1241 u8x16 q5 = vec_xl(0, q5d);
1242 u8x16 q6 = vec_xl(0, q6d);
1243
1244 const u8x16 F = vec_splat_u8(1);
1245
1246 const u8x16 zero = vec_splat_u8(0);
1247 const u16x8 v3u16 = vec_splat_u16(3);
1248 const u16x8 v4u16 = vec_splat_u16(4);
1249 const u16x8 v8u16 = vec_splat_u16(8);
1250 const u8x16 v1u8 = vec_splat_u8(1);
1251 const b8x16 s = (b8x16)vec_splats((uint8_t)128);
1252
1253 const u8x16 a_p6_p0 = vec_absd(p6, p0);
1254 const u8x16 a_p5_p0 = vec_absd(p5, p0);
1255 const u8x16 a_p4_p0 = vec_absd(p4, p0);
1256 const u8x16 a_q4_q0 = vec_absd(q4, q0);
1257 const u8x16 a_q5_q0 = vec_absd(q5, q0);
1258 const u8x16 a_q6_q0 = vec_absd(q6, q0);
1259
1260 const u8x16 a_p1_p0 = vec_absd(p1, p0);
1261 const u8x16 a_q1_q0 = vec_absd(q1, q0);
1262 const u8x16 a_p0_q0 = vec_absd(p0, q0);
1263 const u8x16 a_p1_q1 = vec_absd(p1, q1);
1264 const u8x16 a_p2_p1 = vec_absd(p2, p1);
1265 const u8x16 a_q2_q1 = vec_absd(q2, q1);
1266 const u8x16 a_p2_p0 = vec_absd(p2, p0);
1267 const u8x16 a_q2_q0 = vec_absd(q2, q0);
1268 const u8x16 a_p3_p0 = vec_absd(p3, p0);
1269 const u8x16 a_q3_q0 = vec_absd(q3, q0);
1270 const u8x16 a_p3_p2 = vec_absd(p3, p2);
1271 const u8x16 a_q3_q2 = vec_absd(q3, q2);
1272
1273 u8x16 max_a_p2p1_q2q1 = vec_max(a_p2_p1, a_q2_q1);
1274 u8x16 max_a_p3p2_q3q2 = vec_max(a_p3_p2, a_q3_q2);
1275 u8x16 cmp_E = vec_adds(a_p0_q0, a_p0_q0);
1276 const u8x16 max_a_p1p0_q1q0 = vec_max(a_p1_p0, a_q1_q0);
1277 const u8x16 max_a_p2p0_q2q0 = vec_max(a_p2_p0, a_q2_q0);
1278
1279 const u8x16 max_a_p4p0_q4q0 = vec_max(a_p4_p0, a_q4_q0);
1280 const u8x16 max_a_p5p0_q5q0 = vec_max(a_p5_p0, a_q5_q0);
1281 const u8x16 max_a_p6p0_q6q0 = vec_max(a_p6_p0, a_q6_q0);
1282
1283 b32x4 m8_16 = vec_or(m8, m16);
1284
1285 u8x16 max_a_p3p0_q3q0 = vec_max(a_p3_p0, a_q3_q0);
1286 u8x16 cmp_I_m8 = vec_max(max_a_p2p1_q2q1, max_a_p3p2_q3q2);
1287 u8x16 cmp_I_m4 = max_a_p1p0_q1q0;
1288 cmp_E = vec_adds(vec_sr(a_p1_q1, v1u8), cmp_E);
1289 cmp_I_m8 = vec_and(cmp_I_m8, (u8x16)m8_16);
1290 u8x16 cmp_I = vec_max(cmp_I_m4, cmp_I_m8);
1291 const b8x16 ltE = vec_cmple(cmp_E, E);
1292 const b8x16 ltI = vec_cmple(cmp_I, I);
1293 b8x16 fm = vec_and(ltI, ltE);
1294
1295 fm = vec_and(fm, (b8x16)apply);
1296 if (vec_all_eq(fm, zero))
1297 return;
1298
1299 u8x16 cmp_flat8in = vec_max(max_a_p2p0_q2q0, max_a_p1p0_q1q0);
1300 u8x16 cmp_flat8out = vec_max(max_a_p6p0_q6q0, max_a_p5p0_q5q0);
1301
1302 m8_16 = vec_and(m8_16, (b32x4)fm);
1303 m16 = vec_and(m16, (b32x4)fm);
1304
1305 cmp_flat8in = vec_max(max_a_p3p0_q3q0, cmp_flat8in);
1306 cmp_flat8out = vec_max(max_a_p4p0_q4q0, cmp_flat8out);
1307 b8x16 flat8in = vec_cmple(cmp_flat8in, F);
1308 b8x16 flat8out = vec_cmple(cmp_flat8out, F);
1309 flat8in = vec_and(flat8in, (b8x16)m8_16);
1310 flat8out = vec_and(flat8out, (b8x16)m16);
1311
1312 b8x16 apply_16 = vec_and(flat8out, flat8in);
1313 b8x16 apply_8 = vec_andc(flat8in, flat8out);
1314
1315 UNPACK_16(p6)
1316 UNPACK_16(p5)
1317 UNPACK_16(p4)
1318 UNPACK_16(p3)
1319 UNPACK_16(p2)
1320 UNPACK_16(p1)
1321 UNPACK_16(p0)
1322
1323 b8x16 apply_4 = vec_nor(apply_16, apply_8);
1324
1325 UNPACK_16(q0)
1326 UNPACK_16(q1)
1327 UNPACK_16(q2)
1328 UNPACK_16(q3)
1329 UNPACK_16(q4)
1330 UNPACK_16(q5)
1331 UNPACK_16(q6)
1332
1333 if (vec_any_ne(apply_4, zero)) {
1334 APPLY_4
1335 }
1336 if (vec_any_ne(apply_16, zero)) {
1337 APPLY_16
1338 }
1339 if (vec_any_ne(apply_8, zero)) {
1340 APPLY_8
1341 }
1342
1343 vec_xst(p5, 0, p5d);
1344 vec_xst(p4, 0, p4d);
1345 vec_xst(p3, 0, p3d);
1346 vec_xst(p2, 0, p2d);
1347 vec_xst(p1, 0, p1d);
1348 vec_xst(p0, 0, p0d);
1349 vec_xst(q0, 0, q0d);
1350 vec_xst(q1, 0, q1d);
1351 vec_xst(q2, 0, q2d);
1352 vec_xst(q3, 0, q3d);
1353 vec_xst(q4, 0, q4d);
1354 vec_xst(q5, 0, q5d);
1355 }
1356
1357 #if defined(DAV1D_VSX)
1358 #define LPF(fn) BF(dav1d_lpf_##fn, vsx)
1359 #elif defined(DAV1D_PWR9)
1360 #define LPF(fn) BF(dav1d_lpf_##fn, pwr9)
1361 #endif
1362
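// Superblock entry points matching dav1d's loopfilter_sb signature. Each
// loop iteration covers one edge over 16 rows (h_sb_*) or 16 columns
// (v_sb_*). The filter level L comes from the block's own l[][0], falling
// back to the left/above neighbour when it is zero; the thresholds are
// I = clamp(L >> sharp[0], 1, sharp[1]), H = L >> 4, E = 2 * (L + 2) + I,
// and the wd masks built from vmask[] select which filter width is
// attempted for each 4-row/column group.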
1363 void LPF(h_sb_y)(pixel *dst, const ptrdiff_t stride,
1364 const uint32_t *const vmask,
1365 const uint8_t (*l)[4], ptrdiff_t b4_stride,
1366 const Av1FilterLUT *lut, const int h)
1367 {
1368 unsigned vm = vmask[0] | vmask[1] | vmask[2];
1369
1370 u32x4 vm0 = vec_splats(vmask[0] | vmask[1] | vmask[2]);
1371 u32x4 vm1 = vec_splats(vmask[1]);
1372 u32x4 vm2 = vec_splats(vmask[2]);
1373 u32x4 mm = (u32x4){1, 2, 4, 8};
1374
1375 const u8x16 sharp = vec_xl(0, (uint8_t *)lut->sharp);
1376 const u8x16 s0 = vec_splat(sharp, 0);
1377 const u8x16 s1 = vec_splat(sharp, 8);
1378 const u32x4 v4u32 = vec_splat_u32(4);
1379 const u32x4 zero = vec_splat_u32(0);
1380 const u8x16 v1u8 = vec_splat_u8(1);
1381 const u8x16 v2u8 = vec_splat_u8(2);
1382 const u8x16 v4u8 = vec_splat_u8(4);
1383 const uint8_t (*pl)[4] = &l[-1];
1384
1385 const u8x16 spread = (u8x16){
1386 0x00, 0x00, 0x00, 0x00,
1387 0x04, 0x04, 0x04, 0x04,
1388 0x08, 0x08, 0x08, 0x08,
1389 0x0c, 0x0c, 0x0c, 0x0c,
1390 };
1391
1392 for (;
1393 vm;
1394 vm >>= 4,
1395 mm = vec_sl(mm, v4u32),
1396 dst += 4 * 4 * PXSTRIDE(stride),
1397 pl += 4 * b4_stride) {
1398 if (!(vm & 0x0f))
1399 continue;
1400 u32x4 la = (u32x4)vec_xl(0, (uint8_t *)pl); // l[-1] l[0] ...
1401 u32x4 lb = (u32x4)vec_xl(1 * 4 * b4_stride, (uint8_t *)pl);
1402 u32x4 lc = (u32x4)vec_xl(2 * 4 * b4_stride, (uint8_t *)pl);
1403 u32x4 ld = (u32x4)vec_xl(3 * 4 * b4_stride, (uint8_t *)pl);
1404
1405 u32x4 Lac = vec_mergeh(la, lc); // la[-1] lc[-1] la[0] lc[0]
1406 u32x4 Lbd = vec_mergeh(lb, ld); // lb[-1] ld[-1] lb[0] ld[0]
1407
1408 u32x4 wd16 = vec_and(vm2, mm); // vmask[2] & [1,2,4,8]
1409 u32x4 wd8 = vec_and(vm1, mm); // vmask[1] & [1,2,4,8]
1410 u32x4 wd4 = vec_and(vm0, mm); // vm & [1,2,4,8]
1411
1412 u32x4 L_1 = (u32x4)vec_mergeh(Lac, Lbd); // la[-1] lb[-1] lc[-1] ld[-1]
1413 u32x4 L_0 = (u32x4)vec_mergel(Lac, Lbd); // la[ 0] lb[ 0] lc[ 0] ld[ 0]
1414
1415 b8x16 mask = vec_cmpeq((u8x16)L_0, (u8x16)zero);
1416
1417 u32x4 L4 = (u32x4)vec_sel((u8x16)L_0, (u8x16)L_1, mask); // if !l[0][0] { l[-1][0] }
1418
1419 u8x16 L = (u8x16)vec_perm((u8x16)L4, (u8x16)L4, spread); // La La La La Lb Lb Lb Lb ...
1420
1421 b32x4 m16 = vec_cmpeq(wd16, mm);
1422 b32x4 m8 = vec_cmpeq(wd8, mm);
1423 b32x4 m4 = vec_cmpeq(wd4, mm);
1424
1425 b32x4 apply = vec_cmpne((u32x4)L, zero);
1426
1427 if (vec_all_eq((u32x4)L, zero))
1428 continue;
1429
1430 u8x16 I = vec_sr(L, s0); // L >> sharp[0]
1431 u8x16 H = vec_sr(L, v4u8);
1432 I = vec_min(I, s1); // min(L >> sharp[0], sharp[1])
1433 u8x16 E = vec_add(L, v2u8); // L + 2
1434 I = vec_max(I, v1u8); // max(min(L >> sharp[0], sharp[1]), 1)
1435 E = vec_add(E, E); // 2 * (L + 2)
1436 E = vec_add(E, I); // 2 * (L + 2) + limit
1437
1438 apply = vec_and(m4, apply);
1439
1440 if (vec_any_ne(wd16, zero)) {
1441 loop_filter_h_16_all(dst, E, I, H, PXSTRIDE(stride), apply, m8, m16);
1442 } else if (vec_any_ne(wd8, zero)) {
1443 loop_filter_h_8_all(dst, E, I, H, PXSTRIDE(stride), apply, m8);
1444 } else { // wd4 == 0 already tested
1445 loop_filter_h_4_all(dst, E, I, H, PXSTRIDE(stride), apply);
1446 }
1447 }
1448 }
1449
1450 void LPF(v_sb_y)(pixel *dst, const ptrdiff_t stride,
1451 const uint32_t *const vmask,
1452 const uint8_t (*l)[4], ptrdiff_t b4_stride,
1453 const Av1FilterLUT *lut, const int w)
1454 {
1455 unsigned vm = vmask[0] | vmask[1] | vmask[2];
1456
1457 u32x4 vm0 = vec_splats(vmask[0] | vmask[1] | vmask[2]);
1458 u32x4 vm1 = vec_splats(vmask[1]);
1459 u32x4 vm2 = vec_splats(vmask[2]);
1460
1461 u8x16 sharp = vec_xl(0, (uint8_t *)lut->sharp);
1462 u8x16 s0 = vec_splat(sharp, 0);
1463 u8x16 s1 = vec_splat(sharp, 8);
1464 u32x4 mm = (u32x4){1, 2, 4, 8};
1465 u32x4 v4u32 = vec_splat_u32(4);
1466 u32x4 zero = vec_splat_u32(0);
1467 u8x16 v1u8 = vec_splat_u8(1);
1468 u8x16 v2u8 = vec_splat_u8(2);
1469 u8x16 v4u8 = vec_splat_u8(4);
1470 const uint8_t (*pl)[4] = l;
1471 const uint8_t (*plb4)[4] = l - b4_stride;
1472 const u8x16 spread = (u8x16){
1473 0x00, 0x00, 0x00, 0x00,
1474 0x04, 0x04, 0x04, 0x04,
1475 0x08, 0x08, 0x08, 0x08,
1476 0x0c, 0x0c, 0x0c, 0x0c,
1477 };
1478
1479 for (;
1480 vm;
1481 vm >>= 4,
1482 mm = vec_sl(mm, v4u32),
1483 dst += 4 * 4,
1484 pl += 4,
1485 plb4 += 4) {
1486 if (!(vm & 0x0f))
1487 continue;
1488 u32x4 L_0 = (u32x4)vec_xl(0, (uint8_t *)pl);
1489 u32x4 L_b4 = (u32x4)vec_xl(0, (uint8_t *)plb4);
1490
1491 u32x4 wd16 = vec_and(vm2, mm); // vmask[2] & [1,2,4,8]
1492 u32x4 wd8 = vec_and(vm1, mm); // vmask[1] & [1,2,4,8]
1493 u32x4 wd4 = vec_and(vm0, mm); // vm & [1,2,4,8]
1494
1495 b8x16 mask = vec_cmpeq((u8x16)L_0, (u8x16)zero);
1496
1497 u32x4 L4 = (u32x4)vec_sel((u8x16)L_0, (u8x16)L_b4, mask); // if !l[0][0] { l[-b4_stride][0] }
1498
1499 u8x16 L = (u8x16)vec_perm((u8x16)L4, (u8x16)L4, spread); // La La La La Lb Lb Lb Lb ...
1500
1501 b32x4 m16 = vec_cmpeq(wd16, mm);
1502 b32x4 m8 = vec_cmpeq(wd8, mm);
1503 b32x4 m4 = vec_cmpeq(wd4, mm);
1504
1505 b32x4 apply = vec_cmpne((u32x4)L, zero);
1506
1507 if (vec_all_eq((u32x4)L, zero))
1508 continue;
1509
1510 u8x16 I = vec_sr(L, s0); // L >> sharp[0]
1511 u8x16 H = vec_sr(L, v4u8);
1512 I = vec_min(I, s1); // min(L >> sharp[0], sharp[1])
1513 u8x16 E = vec_add(L, v2u8); // L + 2
1514 I = vec_max(I, v1u8); // max(min(L >> sharp[0], sharp[1]), 1)
1515 E = vec_add(E, E); // 2 * (L + 2)
1516 E = vec_add(E, I); // 2 * (L + 2) + limit
1517
1518 apply = vec_and(apply, m4);
1519
1520 if (vec_any_ne(wd16, zero)) {
1521 loop_filter_v_16_all(dst, E, I, H, PXSTRIDE(stride), apply, m8, m16);
1522 } else if (vec_any_ne(wd8, zero)) {
1523 loop_filter_v_8_all(dst, E, I, H, PXSTRIDE(stride), apply, m8);
1524 } else {
1525 loop_filter_v_4_all(dst, E, I, H, PXSTRIDE(stride), apply);
1526 }
1527
1528 }
1529 }
1530
1531 void LPF(h_sb_uv)(pixel *dst, const ptrdiff_t stride,
1532 const uint32_t *const vmask,
1533 const uint8_t (*l)[4], ptrdiff_t b4_stride,
1534 const Av1FilterLUT *lut, const int h)
1535 {
1536 unsigned vm = vmask[0] | vmask[1];
1537 u32x4 vm0 = vec_splats(vm);
1538 u32x4 vm1 = vec_splats(vmask[1]);
1539 u32x4 mm = (u32x4){1, 2, 4, 8};
1540
1541 const u8x16 sharp = vec_xl(0, (uint8_t *)lut->sharp);
1542 const u8x16 s0 = vec_splat(sharp, 0);
1543 const u8x16 s1 = vec_splat(sharp, 8);
1544 const u32x4 v4u32 = vec_splat_u32(4);
1545 const u32x4 zero = vec_splat_u32(0);
1546 const u8x16 v1u8 = vec_splat_u8(1);
1547 const u8x16 v2u8 = vec_splat_u8(2);
1548 const u8x16 v4u8 = vec_splat_u8(4);
1549 const uint8_t (*pl)[4] = &l[-1];
1550 const u8x16 spread = (u8x16){
1551 0x00, 0x00, 0x00, 0x00,
1552 0x04, 0x04, 0x04, 0x04,
1553 0x08, 0x08, 0x08, 0x08,
1554 0x0c, 0x0c, 0x0c, 0x0c,
1555 };
1556
1557 for (;
1558 vm;
1559 vm >>= 4,
1560 mm = vec_sl(mm, v4u32),
1561 dst += 4 * 4 * PXSTRIDE(stride),
1562 pl += 4 * b4_stride) {
1563 if (!(vm & 0x0f))
1564 continue;
1565 u32x4 la = (u32x4)vec_xl(0, (uint8_t *)pl); // l[-1] l[0] ...
1566 u32x4 lb = (u32x4)vec_xl(1 * 4 * b4_stride, (uint8_t *)pl);
1567 u32x4 lc = (u32x4)vec_xl(2 * 4 * b4_stride, (uint8_t *)pl);
1568 u32x4 ld = (u32x4)vec_xl(3 * 4 * b4_stride, (uint8_t *)pl);
1569
1570 u32x4 Lac = vec_mergeh(la, lc); // la[-1] lc[-1] la[0] lc[0]
1571 u32x4 Lbd = vec_mergeh(lb, ld); // lb[-1] ld[-1] lb[0] ld[0]
1572
1573 u32x4 wd6 = vec_and(vm1, mm); // vmask[1] & [1,2,4,8]
1574 u32x4 wd4 = vec_and(vm0, mm); // vm & [1,2,4,8]
1575
1576 u32x4 L_1 = (u32x4)vec_mergeh(Lac, Lbd); // la[-1] lb[-1] lc[-1] ld[-1]
1577 u32x4 L_0 = (u32x4)vec_mergel(Lac, Lbd); // la[ 0] lb[ 0] lc[ 0] ld[ 0]
1578
1579 b8x16 mask = vec_cmpeq((u8x16)L_0, (u8x16)zero);
1580
1581 u32x4 L4 = (u32x4)vec_sel((u8x16)L_0, (u8x16)L_1, mask); // if !l[0][0] { l[-1][0] }
1582
1583 u8x16 L = (u8x16)vec_perm((u8x16)L4, (u8x16)L4, spread); // La La La La Lb Lb Lb Lb ...
1584
1585 b32x4 m6 = vec_cmpeq(wd6, mm);
1586 b32x4 m4 = vec_cmpeq(wd4, mm);
1587
1588 b32x4 apply = vec_cmpne((u32x4)L, zero);
1589
1590 if (vec_all_eq((u32x4)L, zero))
1591 continue;
1592
1593 u8x16 I = vec_sr(L, s0); // L >> sharp[0]
1594 u8x16 H = vec_sr(L, v4u8);
1595 I = vec_min(I, s1); // min(L >> sharp[0], sharp[1])
1596 u8x16 E = vec_add(L, v2u8); // L + 2
1597 I = vec_max(I, v1u8); // max(min(L >> sharp[0], sharp[1]), 1)
1598 E = vec_add(E, E); // 2 * (L + 2)
1599 E = vec_add(E, I); // 2 * (L + 2) + limit
1600
1601 apply = vec_and(m4, apply);
1602
1603 if (vec_any_ne(wd6, zero)) {
1604 loop_filter_h_6_all(dst, E, I, H, PXSTRIDE(stride), apply, m6);
1605 // loop_filter_h_8
1606 } else { // wd4 == 0 already tested
1607 loop_filter_h_4_all(dst, E, I, H, PXSTRIDE(stride), apply);
1608
1609 // loop_filter_h_4
1610 }
1611
1612 }
1613 }
1614
1615 void LPF(v_sb_uv)(pixel *dst, const ptrdiff_t stride,
1616 const uint32_t *const vmask,
1617 const uint8_t (*l)[4], ptrdiff_t b4_stride,
1618 const Av1FilterLUT *lut, const int w)
1619 {
1620 unsigned vm = vmask[0] | vmask[1];
1621
1622 u32x4 vm0 = vec_splats(vm);
1623 u32x4 vm1 = vec_splats(vmask[1]);
1624
1625 u8x16 sharp = vec_xl(0, (uint8_t *)lut->sharp);
1626 u8x16 s0 = vec_splat(sharp, 0);
1627 u8x16 s1 = vec_splat(sharp, 8);
1628 u32x4 mm = (u32x4){1, 2, 4, 8};
1629 u32x4 v4u32 = vec_splat_u32(4);
1630 u32x4 zero = vec_splat_u32(0);
1631 u8x16 v1u8 = vec_splat_u8(1);
1632 u8x16 v2u8 = vec_splat_u8(2);
1633 u8x16 v4u8 = vec_splat_u8(4);
1634 const uint8_t (*pl)[4] = l;
1635 const uint8_t (*plb4)[4] = l - b4_stride;
1636 const u8x16 spread = (u8x16){
1637 0x00, 0x00, 0x00, 0x00,
1638 0x04, 0x04, 0x04, 0x04,
1639 0x08, 0x08, 0x08, 0x08,
1640 0x0c, 0x0c, 0x0c, 0x0c,
1641 };
1642
1643 for (;
1644 vm;
1645 vm >>= 4,
1646 mm = vec_sl(mm, v4u32),
1647 dst += 4 * 4,
1648 pl += 4,
1649 plb4 += 4) {
1650 if (!(vm & 0x0f))
1651 continue;
1652 u32x4 L_0 = (u32x4)vec_xl(0, (uint8_t *)pl);
1653 u32x4 L_b4 = (u32x4)vec_xl(0, (uint8_t *)plb4);
1654
1655 u32x4 wd6 = vec_and(vm1, mm); // vmask[1] & [1,2,4,8]
1656 u32x4 wd4 = vec_and(vm0, mm); // vm & [1,2,4,8]
1657
1658 b8x16 mask = vec_cmpeq((u8x16)L_0, (u8x16)zero);
1659
1660 u32x4 L4 = (u32x4)vec_sel((u8x16)L_0, (u8x16)L_b4, mask); // if !l[0][0] { l[-b4_stride][0] }
1661
1662 u8x16 L = (u8x16)vec_perm((u8x16)L4, (u8x16)L4, spread); // La La La La Lb Lb Lb Lb ...
1663
1664 b32x4 m6 = vec_cmpeq(wd6, mm);
1665 b32x4 m4 = vec_cmpeq(wd4, mm);
1666
1667 b32x4 apply = vec_cmpne((u32x4)L, zero);
1668
1669 if (vec_all_eq((u32x4)L, zero))
1670 continue;
1671
1672 u8x16 I = vec_sr(L, s0); // L >> sharp[0]
1673 u8x16 H = vec_sr(L, v4u8);
1674 I = vec_min(I, s1); // min(L >> sharp[0], sharp[1])
1675 u8x16 E = vec_add(L, v2u8); // L + 2
1676 I = vec_max(I, v1u8); // max(min(L >> sharp[0], sharp[1]), 1)
1677 E = vec_add(E, E); // 2 * (L + 2)
1678 E = vec_add(E, I); // 2 * (L + 2) + limit
1679
1680 apply = vec_and(apply, m4);
1681
1682 if (vec_any_ne(wd6, zero)) {
1683 loop_filter_v_6_all(dst, E, I, H, PXSTRIDE(stride), apply, m6);
1684 } else {
1685 loop_filter_v_4_all(dst, E, I, H, PXSTRIDE(stride), apply);
1686 }
1687 }
1688 }
1689
1690 #endif // BITDEPTH
1691