/*
 * Copyright © 2023, VideoLAN and dav1d authors
 * Copyright © 2023, Loongson Technology Corporation Limited
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/loongarch/loongson_asm.S"
#include "src/loongarch/loongson_util.S"

// depending on how many pixels need to be stored, returns:
// t4 = (1 << 0) : 0 pixels
// t4 = (1 << 4) : inner 4 pixels
// t4 = (1 << 6) : inner 6 pixels
// t4 = 0 : all pixels
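//
// For reference, a rough scalar C sketch of the wd == 4 path that the FILTER
// macro below vectorizes (one pixel position per call; the vectors handle 16
// positions at once). Helper and parameter names here are illustrative only,
// not the exact dav1d reference implementation:
//
//     #include <stdint.h>
//     #include <stdlib.h>
//
//     static inline int clip(int v, int lo, int hi) {
//         return v < lo ? lo : v > hi ? hi : v;
//     }
//
//     static void lpf_wd4_px(uint8_t *p1, uint8_t *p0, uint8_t *q0, uint8_t *q1,
//                            int E, int I, int H)
//     {
//         // fm: only filter sufficiently smooth edges
//         int fm = abs(*p1 - *p0) <= I && abs(*q1 - *q0) <= I &&
//                  abs(*p0 - *q0) * 2 + (abs(*p1 - *q1) >> 1) <= E;
//         if (!fm) return;                              // t4 = 1 << 0 case
//         int hev = abs(*p1 - *p0) > H || abs(*q1 - *q0) > H;
//         int f  = 3 * (*q0 - *p0) + (hev ? clip(*p1 - *q1, -128, 127) : 0);
//         f      = clip(f, -128, 127);
//         int f1 = clip(f + 4, -128, 127) >> 3;
//         int f2 = clip(f + 3, -128, 127) >> 3;
//         *p0 = clip(*p0 + f2, 0, 255);
//         *q0 = clip(*q0 - f1, 0, 255);
//         if (!hev) {                                   // p1/q1 only change without hev
//             int f3 = (f1 + 1) >> 1;
//             *p1 = clip(*p1 + f3, 0, 255);
//             *q1 = clip(*q1 - f3, 0, 255);
//         }
//     }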
.macro FILTER wd
functionl lpf_16_wd\wd\()_lsx
    vabsd.bu         vr0,   vr22,  vr23 // abs(p1 - p0)
    vabsd.bu         vr1,   vr25,  vr24 // abs(q1 - q0)
    vabsd.bu         vr2,   vr23,  vr24 // abs(p0 - q0)
    vabsd.bu         vr3,   vr22,  vr25 // abs(p1 - q1)
.if \wd >= 6
    vabsd.bu         vr4,   vr21,  vr22 // abs(p2 - p1)
    vabsd.bu         vr5,   vr26,  vr25 // abs(q2 - q1)
.endif
.if \wd >= 8
    vabsd.bu         vr6,   vr20,  vr21 // abs(p3 - p2)
    vabsd.bu         vr7,   vr27,  vr26 // abs(q3 - q2)
.endif
.if \wd >= 6
    vmax.bu          vr4,   vr4,   vr5
.endif
    vsadd.bu         vr2,   vr2,   vr2  // abs(p0 - q0) * 2
.if \wd >= 8
    vmax.bu          vr6,   vr6,   vr7
.endif
    vsrli.b          vr3,   vr3,   1    // abs(p1 - q1) >> 1
.if \wd >= 8
    vmax.bu          vr4,   vr4,   vr6
.endif
.if \wd >= 6
    vand.v           vr4,   vr4,   vr14
.endif
    vmax.bu          vr0,   vr0,   vr1  // max(abs(p1 - p0), abs(q1 - q0))
    vsadd.bu         vr2,   vr2,   vr3  // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
.if \wd >= 6
    vmax.bu          vr4,   vr0,   vr4
    vsle.bu          vr1,   vr4,   vr11 // max(abs(p1 - p0), abs(q1 - q0), abs(), abs(), ...) <= I
.else
    vsle.bu          vr1,   vr0,   vr11 // max(abs(p1 - p0), abs(q1 - q0)) <= I
.endif
    vsle.bu          vr2,   vr2,   vr10 // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 <= E
    vand.v           vr1,   vr1,   vr2  // fm
    vand.v           vr1,   vr1,   vr13 // fm && wd >= 4
.if \wd >= 6
    vand.v           vr14,  vr14,  vr1  // fm && wd > 4
.endif
.if \wd >= 16
    vand.v           vr15,  vr15,  vr1  // fm && wd == 16
.endif
    vhaddw.qu.du     vr8,   vr1,   vr1
    vpickve2gr.du    t6,    vr8,   0
    bnez             t6,    9f          // if (!fm || wd < 4) return;
    li.w             t4,    1 << 0
    jirl             zero,  ra,    0x00

9:
.if \wd >= 6
    vabsd.bu         vr2,   vr21,  vr23 // abs(p2 - p0)
    vabsd.bu         vr3,   vr22,  vr23 // abs(p1 - p0)
    vabsd.bu         vr4,   vr25,  vr24 // abs(q1 - q0)
    vabsd.bu         vr5,   vr26,  vr24 // abs(q2 - q0)
.if \wd >= 8
    vabsd.bu         vr6,   vr20,  vr23 // abs(p3 - p0)
    vabsd.bu         vr7,   vr27,  vr24 // abs(q3 - q0)
.endif
    vmax.bu          vr2,   vr2,   vr3
    vmax.bu          vr4,   vr4,   vr5
.if \wd >= 8
    vmax.bu          vr6,   vr6,   vr7
.endif
    vmax.bu          vr2,   vr2,   vr4
.if \wd >= 8
    vmax.bu          vr2,   vr2,   vr6
.endif

.if \wd == 16
    vabsd.bu         vr3,   vr17,  vr23 // abs(p6 - p0)
    vabsd.bu         vr4,   vr18,  vr23 // abs(p5 - p0)
    vabsd.bu         vr5,   vr19,  vr23 // abs(p4 - p0)
.endif
    vslei.bu         vr2,   vr2,   1    // flat8in
.if \wd == 16
    vabsd.bu         vr6,   vr28,  vr24 // abs(q4 - q0)
    vabsd.bu         vr7,   vr29,  vr24 // abs(q5 - q0)
    vabsd.bu         vr8,   vr30,  vr24 // abs(q6 - q0)
.endif
    vand.v           vr14,  vr2,   vr14 // flat8in && fm && wd > 4
    vandn.v          vr1,   vr14,  vr1  // fm && wd >= 4 && !flat8in
.if \wd == 16
    vmax.bu          vr3,   vr3,   vr4
    vmax.bu          vr5,   vr5,   vr6
.endif
    vhaddw.qu.du     vr9,   vr1,   vr1
.if \wd == 16
    vmax.bu          vr7,   vr7,   vr8
    vmax.bu          vr3,   vr3,   vr5
    vmax.bu          vr3,   vr3,   vr7
    vslei.bu         vr3,   vr3,   1    // flat8out
.endif
    vpickve2gr.du    t6,    vr9,   0
.if \wd == 16
    vand.v           vr15,  vr15,  vr3  // flat8out && fm && wd == 16
    vand.v           vr15,  vr15,  vr14 // flat8out && flat8in && fm && wd == 16
    vandn.v          vr14,  vr15,  vr14 // flat8in && fm && wd >= 4 && !flat8out
.endif
    beqz             t6,    1f          // skip wd == 4 case
.endif
    vxori.b          vr2,   vr22,  128  // p1 - 128
    vxori.b          vr3,   vr25,  128  // q1 - 128
    vslt.bu          vr0,   vr12,  vr0  // hev
    vssub.b          vr2,   vr2,   vr3  // iclip_diff(p1 - q1)
    vand.v           vr4,   vr2,   vr0  // if (hev) iclip_diff(p1 - q1)
    vandn.v          vr0,   vr0,   vr1  // (fm && wd >= 4 && !hev)
    vxor.v           vr5,   vr5,   vr5
    vaddi.hu         vr5,   vr5,   3
    vsubwev.h.bu     vr2,   vr24,  vr23
    vsubwod.h.bu     vr3,   vr24,  vr23
    vmul.h           vr2,   vr2,   vr5
    vmul.h           vr3,   vr3,   vr5
    vxor.v           vr6,   vr6,   vr6
    vaddwev.h.b      vr7,   vr4,   vr6
    vaddwod.h.b      vr6,   vr4,   vr6
    vadd.h           vr2,   vr2,   vr7
    vadd.h           vr3,   vr3,   vr6
    vssrani.b.h      vr2,   vr2,   0
    vssrani.b.h      vr3,   vr3,   0
    vilvl.b          vr2,   vr3,   vr2 // f
    vxor.v           vr6,   vr6,   vr6
    vaddi.bu         vr5,   vr6,   3
    vaddi.bu         vr6,   vr6,   4   // 4
    vsadd.b          vr4,   vr6,   vr2 // imin(f + 4, 127)
    vsadd.b          vr5,   vr5,   vr2 // imin(f + 3, 127)
    vsrai.b          vr4,   vr4,   3   // f1
    vsrai.b          vr5,   vr5,   3   // f2
    vaddi.bu         vr2,   vr23,  0   // p0
    vaddi.bu         vr3,   vr24,  0   // q0
    vxori.b          vr2,   vr2,   128
    vxori.b          vr3,   vr3,   128
    vsadd.b          vr2,   vr2,   vr5 // p0 + f2 out p0
    vssub.b          vr3,   vr3,   vr4 // q0 - f1 out q0
    vxori.b          vr2,   vr2,   128
    vxori.b          vr3,   vr3,   128
    vsrari.b         vr4,   vr4,   1   // (f1 + 1) >> 1
    vbitsel.v        vr23,  vr23,  vr2,   vr1 // if (fm && wd >= 4)
    vbitsel.v        vr24,  vr24,  vr3,   vr1 // if (fm && wd >= 4)
    vaddi.bu         vr2,   vr22,  0   // p1
    vaddi.bu         vr3,   vr25,  0   // q1
    vxori.b          vr2,   vr2,   128
    vxori.b          vr3,   vr3,   128
    vsadd.b          vr2,   vr2,   vr4 // out p1
    vssub.b          vr3,   vr3,   vr4 // out q1
    vxori.b          vr2,   vr2,   128
    vxori.b          vr3,   vr3,   128
    vbitsel.v        vr22,  vr22,  vr2,   vr0 // if (fm && wd >= 4 && !hev)
    vbitsel.v        vr25,  vr25,  vr3,   vr0 // if (fm && wd >= 4 && !hev)
1:

.if \wd == 6
    vhaddw.qu.du     vr0,   vr14,  vr14
    vpickve2gr.du    t6,    vr0,   0
    beqz             t6,    2f        // skip if there's no flat8in

    vaddwev.h.bu     vr0,   vr21,  vr21
    vaddwod.h.bu     vr1,   vr21,  vr21 // p2 * 2
    vaddwev.h.bu     vr2,   vr21,  vr22
    vaddwod.h.bu     vr3,   vr21,  vr22 // p2 + p1
    vaddwev.h.bu     vr4,   vr22,  vr23
    vaddwod.h.bu     vr5,   vr22,  vr23 // p1 + p0
    vaddwev.h.bu     vr6,   vr23,  vr24
    vaddwod.h.bu     vr7,   vr23,  vr24 // p0 + q0
    vadd.h           vr8,   vr0,   vr2
    vadd.h           vr9,   vr1,   vr3
    vadd.h           vr10,  vr4,   vr6
    vadd.h           vr11,  vr5,   vr7
    vaddwev.h.bu     vr12,  vr24,  vr25
    vaddwod.h.bu     vr13,  vr24,  vr25 // q0 + q1
    vadd.h           vr8,   vr8,   vr10
    vadd.h           vr9,   vr9,   vr11
    vsub.h           vr12,  vr12,  vr0
    vsub.h           vr13,  vr13,  vr1
    vaddwev.h.bu     vr10,  vr25,  vr26
    vaddwod.h.bu     vr11,  vr25,  vr26 // q1 + q2
    vssrlrni.bu.h    vr0,   vr8,   3
    vssrlrni.bu.h    vr1,   vr9,   3
    vilvl.b          vr0,   vr1,   vr0  // out p1

    vadd.h           vr8,   vr8,   vr12
    vadd.h           vr9,   vr9,   vr13
    vsub.h           vr10,  vr10,  vr2
    vsub.h           vr11,  vr11,  vr3
    vaddwev.h.bu     vr12,  vr26,  vr26 // q2 + q2
    vaddwod.h.bu     vr13,  vr26,  vr26
    vssrlrni.bu.h    vr1,   vr8,   3
    vssrlrni.bu.h    vr2,   vr9,   3
    vilvl.b          vr1,   vr2,   vr1  // out p0

    vadd.h           vr8,   vr8,   vr10
    vadd.h           vr9,   vr9,   vr11
    vsub.h           vr12,  vr12,  vr4
    vsub.h           vr13,  vr13,  vr5
    vssrlrni.bu.h    vr2,   vr8,   3
    vssrlrni.bu.h    vr3,   vr9,   3
    vilvl.b          vr2,   vr3,   vr2  // out q0

    vbitsel.v        vr22,  vr22,  vr0,  vr14
    vadd.h           vr8,   vr8,   vr12
    vadd.h           vr9,   vr9,   vr13
    vbitsel.v        vr23,  vr23,  vr1,  vr14
    vssrlrni.bu.h    vr3,   vr8,   3
    vssrlrni.bu.h    vr4,   vr9,   3
    vilvl.b          vr3,   vr4,   vr3
    vbitsel.v        vr24,  vr24,  vr2,  vr14
    vbitsel.v        vr25,  vr25,  vr3,  vr14
.elseif \wd >= 8
    vhaddw.qu.du     vr0,   vr14,  vr14
    vpickve2gr.du    t6,    vr0,   0
.if \wd == 8
    beqz             t6,    8f  // skip if there's no flat8in
.else
    beqz             t6,    2f  // skip if there's no flat8in
.endif

    vaddwev.h.bu     vr0,   vr20,  vr21
    vaddwod.h.bu     vr1,   vr20,  vr21 // p3 + p2
    vaddwev.h.bu     vr2,   vr22,  vr25
    vaddwod.h.bu     vr3,   vr22,  vr25 // p1 + q1
    vaddwev.h.bu     vr4,   vr20,  vr22
    vaddwod.h.bu     vr5,   vr20,  vr22 // p3 + p1
    vaddwev.h.bu     vr6,   vr23,  vr26
    vaddwod.h.bu     vr7,   vr23,  vr26 // p0 + q2
    vadd.h           vr8,   vr0,   vr0
    vadd.h           vr9,   vr1,   vr1  // 2 * (p3 + p2)
    vxor.v           vr10,  vr10,  vr10
    vaddwev.h.bu     vr11,  vr23,  vr10
    vaddwod.h.bu     vr12,  vr23,  vr10
    vaddwev.h.bu     vr13,  vr24,  vr10
    vaddwod.h.bu     vr10,  vr24,  vr10
    vadd.h           vr8,   vr8,   vr11 // + p0
    vadd.h           vr9,   vr9,   vr12
    vadd.h           vr8,   vr8,   vr13 // + q0
    vadd.h           vr9,   vr9,   vr10
    vadd.h           vr8,   vr8,   vr4
    vadd.h           vr9,   vr9,   vr5  // + p3 + p1
    vsub.h           vr2,   vr2,   vr0
    vsub.h           vr3,   vr3,   vr1  // p1 + q1 - p3 - p2
    vsub.h           vr6,   vr6,   vr4
    vsub.h           vr7,   vr7,   vr5  // p0 + q2 - p3 - p1
    vssrlrni.bu.h    vr10,  vr8,   3
    vssrlrni.bu.h    vr11,  vr9,   3
    vilvl.b          vr10,  vr11,  vr10 // out p2

    vadd.h           vr8,   vr8,   vr2
    vadd.h           vr9,   vr9,   vr3
    vaddwev.h.bu     vr0,   vr20,  vr23
    vaddwod.h.bu     vr1,   vr20,  vr23 // p3 + p0
    vaddwev.h.bu     vr2,   vr24,  vr27
    vaddwod.h.bu     vr3,   vr24,  vr27 // q0 + q3
    vssrlrni.bu.h    vr11,  vr8,   3
    vssrlrni.bu.h    vr12,  vr9,   3
    vilvl.b          vr11,  vr12,  vr11 // out p1

    vadd.h           vr8,   vr8,   vr6
    vadd.h           vr9,   vr9,   vr7
    vsub.h           vr2,   vr2,   vr0 // q0 + q3 - p3 - p0
    vsub.h           vr3,   vr3,   vr1
    vaddwev.h.bu     vr4,   vr21,  vr24 // p2 + q0
    vaddwod.h.bu     vr5,   vr21,  vr24
    vaddwev.h.bu     vr6,   vr25,  vr27 // q1 + q3
    vaddwod.h.bu     vr7,   vr25,  vr27
    vssrlrni.bu.h    vr12,  vr8,   3
    vssrlrni.bu.h    vr13,  vr9,   3
    vilvl.b          vr12,  vr13,  vr12 // out p0

    vadd.h           vr8,   vr8,   vr2
    vadd.h           vr9,   vr9,   vr3
    vsub.h           vr6,   vr6,   vr4 // q1 + q3 - p2 - q0
    vsub.h           vr7,   vr7,   vr5
    vaddwev.h.bu     vr0,   vr22,  vr25 // p1 + q1
    vaddwod.h.bu     vr1,   vr22,  vr25
    vaddwev.h.bu     vr2,   vr26,  vr27
    vaddwod.h.bu     vr3,   vr26,  vr27 // q2 + q3
    vssrlrni.bu.h    vr13,  vr8,   3
    vssrlrni.bu.h    vr4,   vr9,   3
    vilvl.b          vr13,  vr4,   vr13 // out q0

    vadd.h           vr8,   vr8,   vr6
    vadd.h           vr9,   vr9,   vr7
    vsub.h           vr2,   vr2,   vr0  // q2 + q3 - p1 - q1
    vsub.h           vr3,   vr3,   vr1
    vssrlrni.bu.h    vr0,   vr8,   3
    vssrlrni.bu.h    vr1,   vr9,   3
    vilvl.b          vr0,   vr1,   vr0 // out q1

    vadd.h           vr8,   vr8,   vr2
    vadd.h           vr9,   vr9,   vr3

    vbitsel.v        vr21,  vr21,  vr10,  vr14
    vbitsel.v        vr22,  vr22,  vr11,  vr14
    vbitsel.v        vr23,  vr23,  vr12,  vr14
    vbitsel.v        vr24,  vr24,  vr13,  vr14
    vssrlrni.bu.h    vr1,   vr8,   3
    vssrlrni.bu.h    vr2,   vr9,   3
    vilvl.b          vr1,   vr2,  vr1 // out q2
    vbitsel.v        vr25,  vr25,  vr0,   vr14
    vbitsel.v        vr26,  vr26,  vr1,   vr14
.endif
2:
.if \wd == 16
    vhaddw.qu.du     vr2,   vr15,  vr15
    vpickve2gr.du    t6,    vr2,   0
    bnez             t6,    1f                 // check if flat8out is needed
    vhaddw.qu.du     vr2,   vr14,  vr14
    vpickve2gr.du    t6,    vr2,   0
    beqz             t6,    8f                 // if there was no flat8in, just write the inner 4 pixels
    b                7f                        // if flat8in was used, write the inner 6 pixels
1:

    vaddwev.h.bu     vr2,   vr17,  vr17  // p6 + p6
    vaddwod.h.bu     vr3,   vr17,  vr17
    vaddwev.h.bu     vr4,   vr17,  vr18
    vaddwod.h.bu     vr5,   vr17,  vr18  // p6 + p5
    vaddwev.h.bu     vr6,   vr17,  vr19
    vaddwod.h.bu     vr7,   vr17,  vr19  // p6 + p4
    vaddwev.h.bu     vr8,   vr17,  vr20
    vaddwod.h.bu     vr9,   vr17,  vr20  // p6 + p3
    vadd.h           vr12,  vr2,   vr4
    vadd.h           vr13,  vr3,   vr5
    vadd.h           vr10,  vr6,   vr8
    vadd.h           vr11,  vr7,   vr9
    vaddwev.h.bu     vr6,   vr17,  vr21
    vaddwod.h.bu     vr7,   vr17,  vr21  // p6 + p2
    vadd.h           vr12,  vr12,  vr10
    vadd.h           vr13,  vr13,  vr11
    vaddwev.h.bu     vr8,   vr17,  vr22
    vaddwod.h.bu     vr9,   vr17,  vr22  // p6 + p1
    vaddwev.h.bu     vr10,  vr18,  vr23
    vaddwod.h.bu     vr11,  vr18,  vr23  // p5 + p0
    vadd.h           vr6,   vr6,   vr8
    vadd.h           vr7,   vr7,   vr9
    vaddwev.h.bu     vr8,   vr19,  vr24
    vaddwod.h.bu     vr9,   vr19,  vr24  // p4 + q0
    vadd.h           vr12,  vr12,  vr6
    vadd.h           vr13,  vr13,  vr7
    vadd.h           vr10,  vr10,  vr8
    vadd.h           vr11,  vr11,  vr9
    vaddwev.h.bu     vr6,   vr20,  vr25
    vaddwod.h.bu     vr7,   vr20,  vr25  // p3 + q1
    vadd.h           vr12,  vr12,  vr10
    vadd.h           vr13,  vr13,  vr11
    vsub.h           vr6,   vr6,   vr2
    vsub.h           vr7,   vr7,   vr3
    vaddwev.h.bu     vr2,   vr21,  vr26
    vaddwod.h.bu     vr3,   vr21,  vr26  // p2 + q2
    vssrlrni.bu.h    vr0,   vr12,  4
    vssrlrni.bu.h    vr1,   vr13,  4
    vilvl.b          vr0,   vr1,   vr0   // out p5
    vadd.h           vr12,  vr12,  vr6
    vadd.h           vr13,  vr13,  vr7   // - (p6 + p6) + (p3 + q1)
    vsub.h           vr2,   vr2,   vr4
    vsub.h           vr3,   vr3,   vr5
    vaddwev.h.bu     vr4,   vr22,  vr27
    vaddwod.h.bu     vr5,   vr22,  vr27  // p1 + q3
    vaddwev.h.bu     vr6,   vr17,  vr19
    vaddwod.h.bu     vr7,   vr17,  vr19  // p6 + p4
    vssrlrni.bu.h    vr1,   vr12,  4
    vssrlrni.bu.h    vr8,   vr13,  4
    vilvl.b          vr1,   vr8,   vr1   // out p4
    vadd.h           vr12,  vr12,  vr2
    vadd.h           vr13,  vr13,  vr3   // - (p6 + p5) + (p2 + q2)
    vsub.h           vr4,   vr4,   vr6
    vsub.h           vr5,   vr5,   vr7
    vaddwev.h.bu     vr6,   vr23,  vr28
    vaddwod.h.bu     vr7,   vr23,  vr28  // p0 + q4
    vaddwev.h.bu     vr8,   vr17,  vr20
    vaddwod.h.bu     vr9,   vr17,  vr20  // p6 + p3
    vssrlrni.bu.h    vr2,   vr12,  4
    vssrlrni.bu.h    vr10,  vr13,  4
    vilvl.b          vr2,   vr10,  vr2   // out p3
    vadd.h           vr12,  vr12,  vr4
    vadd.h           vr13,  vr13,  vr5   // - (p6 + p4) + (p1 + q3)
    vsub.h           vr6,   vr6,   vr8
    vsub.h           vr7,   vr7,   vr9
    vaddwev.h.bu     vr8,   vr24,  vr29
    vaddwod.h.bu     vr9,   vr24,  vr29  // q0 + q5
    vaddwev.h.bu     vr4,   vr17,  vr21
    vaddwod.h.bu     vr5,   vr17,  vr21  // p6 + p2
    vssrlrni.bu.h    vr3,   vr12,  4
    vssrlrni.bu.h    vr11,  vr13,  4
    vilvl.b          vr3,   vr11,  vr3   // out p2
    vadd.h           vr12,  vr12,  vr6
    vadd.h           vr13,  vr13,  vr7   // - (p6 + p3) + (p0 + q4)
    vsub.h           vr8,   vr8,   vr4
    vsub.h           vr9,   vr9,   vr5
    vaddwev.h.bu     vr6,   vr25,  vr30
    vaddwod.h.bu     vr7,   vr25,  vr30  // q1 + q6
    vaddwev.h.bu     vr10,  vr17,  vr22
    vaddwod.h.bu     vr11,  vr17,  vr22  // p6 + p1
    vssrlrni.bu.h    vr4,   vr12,  4
    vssrlrni.bu.h    vr5,   vr13,  4
    vilvl.b          vr4,   vr5,   vr4   // out p1

    vadd.h           vr12,  vr12,  vr8
    vadd.h           vr13,  vr13,  vr9   // - (p6 + p2) + (q0 + q5)
    vsub.h           vr6,   vr6,   vr10
    vsub.h           vr7,   vr7,   vr11
    vaddwev.h.bu     vr8,   vr26,  vr30
    vaddwod.h.bu     vr9,   vr26,  vr30  // q2 + q6
    vbitsel.v        vr0,   vr18,  vr0,  vr15  // out p5
    vaddwev.h.bu     vr10,  vr18,  vr23
    vaddwod.h.bu     vr11,  vr18,  vr23  // p5 + p0
    vssrlrni.bu.h    vr5,   vr12,  4
    vssrlrni.bu.h    vr18,  vr13,  4
    vilvl.b          vr5,   vr18,  vr5   // out p0

    vadd.h           vr12,  vr12,  vr6
    vadd.h           vr13,  vr13,  vr7   // - (p6 + p1) + (q1 + q6)
    vsub.h           vr8,   vr8,   vr10
    vsub.h           vr9,   vr9,   vr11
    vaddwev.h.bu     vr10,  vr27,  vr30
    vaddwod.h.bu     vr11,  vr27,  vr30  // q3 + q6
    vbitsel.v        vr1,   vr19,  vr1,  vr15  // out p4

    vaddwev.h.bu     vr18,  vr19,  vr24
    vaddwod.h.bu     vr19,  vr19,  vr24  // p4 + q0
    vssrlrni.bu.h    vr6,   vr12,  4
    vssrlrni.bu.h    vr7,   vr13,  4
    vilvl.b          vr6,   vr7,   vr6   // out q0

    vadd.h           vr12,  vr12,  vr8
    vadd.h           vr13,  vr13,  vr9   // - (p5 + p0) + (q2 + q6)
    vsub.h           vr10,  vr10,  vr18
    vsub.h           vr11,  vr11,  vr19
    vaddwev.h.bu     vr8,   vr28,  vr30
    vaddwod.h.bu     vr9,   vr28,  vr30  // q4 + q6
    vbitsel.v        vr2,   vr20,  vr2,  vr15  // out p3
    vaddwev.h.bu     vr18,  vr20,  vr25
    vaddwod.h.bu     vr19,  vr20,  vr25  // p3 + q1
    vssrlrni.bu.h    vr7,   vr12,  4
    vssrlrni.bu.h    vr20,  vr13,  4
    vilvl.b          vr7,   vr20,  vr7   // out q1

    vadd.h           vr12,  vr12,  vr10
    vadd.h           vr13,  vr13,  vr11  // - (p4 + q0) + (q3 + q6)
    vsub.h           vr18,  vr8,   vr18
    vsub.h           vr19,  vr9,   vr19
    vaddwev.h.bu     vr10,  vr29,  vr30
    vaddwod.h.bu     vr11,  vr29,  vr30  // q5 + q6
    vbitsel.v        vr3,   vr21,  vr3,  vr15  // out p2
    vaddwev.h.bu     vr20,  vr21,  vr26
    vaddwod.h.bu     vr21,  vr21,  vr26  // p2 + q2
    vssrlrni.bu.h    vr8,   vr12,  4
    vssrlrni.bu.h    vr9,   vr13,  4
    vilvl.b          vr8,   vr9,   vr8   // out q2

    vadd.h           vr12,  vr12,  vr18
    vadd.h           vr13,  vr13,  vr19  // - (p3 + q1) + (q4 + q6)
    vsub.h           vr10,  vr10,  vr20
    vsub.h           vr11,  vr11,  vr21
    vaddwev.h.bu     vr18,  vr30,  vr30
    vaddwod.h.bu     vr19,  vr30,  vr30  // q6 + q6
    vbitsel.v        vr4,   vr22,  vr4,  vr15  // out p1
    vaddwev.h.bu     vr20,  vr22,  vr27
    vaddwod.h.bu     vr21,  vr22,  vr27  // p1 + q3
    vssrlrni.bu.h    vr9,   vr12,  4
    vssrlrni.bu.h    vr22,  vr13,  4
    vilvl.b          vr9,   vr22,  vr9   // out q3
    vadd.h           vr12,  vr12,  vr10
    vadd.h           vr13,  vr13,  vr11  // - (p2 + q2) + (q5 + q6)
    vsub.h           vr18,  vr18,  vr20
    vsub.h           vr19,  vr19,  vr21
    vbitsel.v        vr5,   vr23,  vr5,  vr15  // out p0
    vssrlrni.bu.h    vr10,  vr12,  4
    vssrlrni.bu.h    vr23,  vr13,  4
    vilvl.b          vr10,  vr23,  vr10  // out q4
    vadd.h           vr12,  vr12,  vr18
    vadd.h           vr13,  vr13,  vr19  // - (p1 + q3) + (q6 + q6)
    vssrlrni.bu.h    vr11,  vr12,  4
    vssrlrni.bu.h    vr12,  vr13,  4
    vilvl.b          vr11,  vr12,  vr11  // out q5
    vbitsel.v        vr6,   vr24,  vr6,  vr15
    vbitsel.v        vr7,   vr25,  vr7,  vr15
    vbitsel.v        vr8,   vr26,  vr8,  vr15
    vbitsel.v        vr9,   vr27,  vr9,  vr15
    vbitsel.v        vr10,  vr28,  vr10, vr15
    vbitsel.v        vr11,  vr29,  vr11, vr15
.endif
    li.w             t4,    0
    jirl             zero,  ra,    0x00
.if \wd == 16
7:
    // Return to a shorter epilogue, writing only the inner 6 pixels
    li.w             t4,    1 << 6
    jirl             zero,  ra,    0x00
.endif
.if \wd >= 8
8:
    // Return to a shorter epilogue, writing only the inner 4 pixels
    li.w             t4,    1 << 4
    jirl             zero,  ra,    0x00
.endif
endfuncl
.endm

FILTER 16
FILTER 8
FILTER 6
FILTER 4

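// The LPF_16_WD* wrappers below call the corresponding lpf_16_wd*_lsx routine
// and then dispatch on t4 to one of the caller's store epilogues. As a
// hypothetical C-level sketch (the 7:/8: labels live in the callers below):
//
//     int t4 = lpf_16_wdN_lsx();                       // the FILTER body above
//     if (t4 == 0)             store_all_pixels();     // fall through to 1:
//     else if (t4 & (1 << 6))  goto caller_label_7;    // store inner 6 pixels
//     else if (t4 & (1 << 4))  goto caller_label_8;    // store inner 4 pixels
//     else /* t4 == 1 << 0 */  return;                 // nothing to store
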
.macro LPF_16_WD16
    move             t7,    ra
    bl               lpf_16_wd16_lsx
    move             ra,    t7
    beqz             t4,    1f
    andi             t5,    t4,    1 << 6
    bnez             t5,    7f
    andi             t5,    t4,    1 << 4
    bnez             t5,    8f
    jirl             zero,  ra,    0x00
1:
.endm

.macro LPF_16_WD8
    move             t7,    ra
    bl               lpf_16_wd8_lsx
    move             ra,    t7
    beqz             t4,    1f
    andi             t5,    t4,    1 << 4
    bnez             t5,    8f
    jirl             zero,  ra,    0x00
1:
.endm

.macro LPF_16_WD6
    move             t7,    ra
    bl               lpf_16_wd6_lsx
    move             ra,    t7
    beqz             t4,    1f
    jirl             zero,  ra,    0x00
1:
.endm

.macro LPF_16_WD4
    move             t7,    ra
    bl               lpf_16_wd4_lsx
    move             ra,    t7
    beqz             t4,    1f
    jirl             zero,  ra,    0x00
1:
.endm

functionl lpf_v_4_16_lsx
    slli.d           t3,    a1,    1
    sub.d            t3,    a0,    t3
    vld              vr22,  t3,    0   // p1
    vldx             vr23,  t3,    a1  // p0
    vld              vr24,  a0,    0   // q0
    vldx             vr25,  a0,    a1  // q1

    LPF_16_WD4

    vst              vr22,  t3,    0   // p1
    vstx             vr23,  t3,    a1  // p0
    vst              vr24,  a0,    0   // q0
    vstx             vr25,  a0,    a1  // q1
endfuncl

functionl lpf_h_4_16_lsx
    addi.d           t3,    a0,   -2
    fld.s            f22,   t3,    0
    fldx.s           f23,   t3,    a1
    alsl.d           t3,    a1,    t3,   1
    fld.s            f24,   t3,    0
    fldx.s           f25,   t3,    a1
    alsl.d           t3,    a1,    t3,   1
    fld.s            f17,   t3,    0
    fldx.s           f18,   t3,    a1
    alsl.d           t3,    a1,    t3,   1
    fld.s            f19,   t3,    0
    fldx.s           f20,   t3,    a1
    alsl.d           t3,    a1,    t3,   1
    vilvl.w          vr22,  vr17,  vr22
    vilvl.w          vr23,  vr18,  vr23
    vilvl.w          vr24,  vr19,  vr24
    vilvl.w          vr25,  vr20,  vr25
    fld.s            f17,   t3,    0
    fldx.s           f18,   t3,    a1
    alsl.d           t3,    a1,    t3,   1
    fld.s            f19,   t3,    0
    fldx.s           f20,   t3,    a1
    alsl.d           t3,    a1,    t3,   1
    fld.s            f26,   t3,    0
    fldx.s           f27,   t3,    a1
    alsl.d           t3,    a1,    t3,   1
    fld.s            f28,   t3,    0
    fldx.s           f29,   t3,    a1
    alsl.d           t3,    a1,    t3,   1
    vilvl.w          vr17,  vr26,  vr17
    vilvl.w          vr18,  vr27,  vr18
    vilvl.w          vr19,  vr28,  vr19
    vilvl.w          vr20,  vr29,  vr20
    vilvl.d          vr22,  vr17,  vr22
    vilvl.d          vr23,  vr18,  vr23
    vilvl.d          vr24,  vr19,  vr24
    vilvl.d          vr25,  vr20,  vr25
    addi.d           a0,    t3,    2

    TRANSPOSE_4x16B  vr22, vr23, vr24, vr25, vr26, vr27, vr28, vr29

    LPF_16_WD4

    slli.d           t3,    a1,    4
    sub.d            a0,    a0,    t3

    TRANSPOSE_4x16B  vr22, vr23, vr24, vr25, vr26, vr27, vr28, vr29

    addi.d           a0,    a0,   -2
.irp i, vr22, vr23, vr24, vr25
    vstelm.w         \i,    a0,    0,    0
    add.d            a0,    a0,    a1
.endr
.irp i, vr22, vr23, vr24, vr25
    vstelm.w         \i,    a0,    0,    1
    add.d            a0,    a0,    a1
.endr
.irp i, vr22, vr23, vr24, vr25
    vstelm.w         \i,    a0,    0,    2
    add.d            a0,    a0,    a1
.endr
.irp i, vr22, vr23, vr24, vr25
    vstelm.w         \i,    a0,    0,    3
    add.d            a0,    a0,    a1
.endr
    addi.d           a0,    a0,    2
endfuncl

functionl lpf_v_6_16_lsx
    slli.d           t3,    a1,    1
    sub.d            t3,    a0,    t3
    sub.d            s0,    t3,    a1
    vld              vr21,  s0,    0   // p2
    vldx             vr22,  s0,    a1  // p1
    alsl.d           s0,    a1,    s0,   1
    vld              vr23,  s0,    0   // p0
    vldx             vr24,  s0,    a1  // q0
    alsl.d           s0,    a1,    s0,   1
    vld              vr25,  s0,    0   // q1
    vldx             vr26,  s0,    a1  // q2

    LPF_16_WD6

    vst              vr22,  t3,    0   // p1
    vstx             vr23,  t3,    a1  // p0
    vst              vr24,  a0,    0   // q0
    vstx             vr25,  a0,    a1  // q1
endfuncl

functionl lpf_h_6_16_lsx
    addi.d           t3,    a0,   -4
    fld.d            f20,   t3,    0
    fldx.d           f21,   t3,    a1
    alsl.d           t3,    a1,    t3,   1
    fld.d            f22,   t3,    0
    fldx.d           f23,   t3,    a1
    alsl.d           t3,    a1,    t3,   1
    fld.d            f24,   t3,    0
    fldx.d           f25,   t3,    a1
    alsl.d           t3,    a1,    t3,   1
    fld.d            f26,   t3,    0
    fldx.d           f27,   t3,    a1
    alsl.d           t3,    a1,    t3,   1
    fld.d            f16,   t3,    0
    fldx.d           f17,   t3,    a1
    alsl.d           t3,    a1,    t3,   1
    fld.d            f18,   t3,    0
    fldx.d           f19,   t3,    a1
    alsl.d           t3,    a1,    t3,   1
    fld.d            f28,   t3,    0
    fldx.d           f29,   t3,    a1
    alsl.d           t3,    a1,    t3,   1
    fld.d            f30,   t3,    0
    fldx.d           f31,   t3,    a1
    alsl.d           t3,    a1,    t3,   1

    vilvl.d          vr20,  vr16,  vr20
    vilvl.d          vr21,  vr17,  vr21
    vilvl.d          vr22,  vr18,  vr22
    vilvl.d          vr23,  vr19,  vr23
    vilvl.d          vr24,  vr28,  vr24
    vilvl.d          vr25,  vr29,  vr25
    vilvl.d          vr26,  vr30,  vr26
    vilvl.d          vr27,  vr31,  vr27
    addi.d           a0,    t3,    4

    TRANSPOSE_8x16B vr20, vr21, vr22, vr23, vr24, vr25, vr26, vr27, vr28, vr29

    LPF_16_WD6

    slli.d           t3,    a1,    4
    sub.d            a0,    a0,    t3

    TRANSPOSE_4x16b vr22, vr23, vr24, vr25, vr26, vr27, vr28, vr29

    addi.d           a0,    a0,   -2
.irp i, vr22, vr23, vr24, vr25
    vstelm.w         \i,    a0,    0,    0
    add.d            a0,    a0,    a1
.endr
.irp i, vr22, vr23, vr24, vr25
    vstelm.w         \i,    a0,    0,    1
    add.d            a0,    a0,    a1
.endr
.irp i, vr22, vr23, vr24, vr25
    vstelm.w         \i,    a0,    0,    2
    add.d            a0,    a0,    a1
.endr
.irp i, vr22, vr23, vr24, vr25
    vstelm.w         \i,    a0,    0,    3
    add.d            a0,    a0,    a1
.endr
    addi.d           a0,    a0,    2
endfuncl

functionl lpf_v_8_16_lsx
    slli.d           t3,    a1,    2
    sub.d            s0,    a0,    t3
    vld              vr20,  s0,    0   // p3
    vldx             vr21,  s0,    a1  // p2
    alsl.d           s0,    a1,    s0,   1
    vld              vr22,  s0,    0   // p1
    vldx             vr23,  s0,    a1  // p0
    alsl.d           s0,    a1,    s0,   1
    vld              vr24,  s0,    0   // q0
    vldx             vr25,  s0,    a1  // q1
    alsl.d           s0,    a1,    s0,   1
    vld              vr26,  s0,    0   // q2
    vldx             vr27,  s0,    a1  // q3

    LPF_16_WD8

    sub.d            t3,    a0,    t3
    add.d            t3,    t3,    a1  // -3
    vst              vr21,  t3,    0   // p2
    vstx             vr22,  t3,    a1  // p1
    alsl.d           t3,    a1,    t3,   1
    vst              vr23,  t3,    0   // p0
    vstx             vr24,  t3,    a1  // q0
    alsl.d           t3,    a1,    t3,   1
    vst              vr25,  t3,    0   // q1
    vstx             vr26,  t3,    a1  // q2
    jirl             zero,  ra,    0x00
8:
    slli.d           t3,    a1,    1
    sub.d            t3,    a0,    t3
    vst              vr22,  t3,    0   // p1
    vstx             vr23,  t3,    a1  // p0
    alsl.d           t3,    a1,    t3,   1
    vst              vr24,  t3,    0   // q0
    vstx             vr25,  t3,    a1  // q1
endfuncl

functionl lpf_h_8_16_lsx
    addi.d           t3,    a0,   -4
    fld.d            f20,   t3,    0
    fldx.d           f21,   t3,    a1
    alsl.d           t3,    a1,    t3,   1
    fld.d            f22,   t3,    0
    fldx.d           f23,   t3,    a1
    alsl.d           t3,    a1,    t3,   1
    fld.d            f24,   t3,    0
    fldx.d           f25,   t3,    a1
    alsl.d           t3,    a1,    t3,   1
    fld.d            f26,   t3,    0
    fldx.d           f27,   t3,    a1

    alsl.d           t3,    a1,    t3,   1
    fld.d            f16,   t3,    0
    fldx.d           f17,   t3,    a1
    alsl.d           t3,    a1,    t3,   1
    fld.d            f18,   t3,    0
    fldx.d           f19,   t3,    a1
    alsl.d           t3,    a1,    t3,   1
    fld.d            f28,   t3,    0
    fldx.d           f29,   t3,    a1
    alsl.d           t3,    a1,    t3,   1
    fld.d            f30,   t3,    0
    fldx.d           f31,   t3,    a1
    alsl.d           t3,    a1,    t3,   1

    vilvl.d          vr20,  vr16,  vr20
    vilvl.d          vr21,  vr17,  vr21
    vilvl.d          vr22,  vr18,  vr22
    vilvl.d          vr23,  vr19,  vr23
    vilvl.d          vr24,  vr28,  vr24
    vilvl.d          vr25,  vr29,  vr25
    vilvl.d          vr26,  vr30,  vr26
    vilvl.d          vr27,  vr31,  vr27
    addi.d           a0,    t3,    4

    TRANSPOSE_8x16B vr20, vr21, vr22, vr23, vr24, vr25, vr26, vr27, vr28, vr29

    LPF_16_WD8

    slli.d           t3,    a1,    4
    sub.d            a0,    a0,    t3

    TRANSPOSE_8x16b vr20, vr21, vr22, vr23, vr24, vr25, vr26, vr27, vr28, vr29

    addi.d           a0,    a0,   -4
.irp i, vr20, vr21, vr22, vr23, vr24, vr25, vr26, vr27
    vstelm.d         \i,    a0,    0,    0
    add.d            a0,    a0,    a1
.endr
.irp i, vr20, vr21, vr22, vr23, vr24, vr25, vr26, vr27
    vstelm.d         \i,    a0,    0,    1
    add.d            a0,    a0,    a1
.endr
    addi.d           a0,    a0,    4
    jirl             zero,  ra,    0x00

8:
    slli.d           t3,    a1,    4
    sub.d            a0,    a0,    t3

    TRANSPOSE_4x16B vr22, vr23, vr24, vr25, vr26, vr27, vr28, vr29

    addi.d           a0,    a0,   -2
.irp i, vr22, vr23, vr24, vr25
    vstelm.w         \i,    a0,    0,    0
    add.d            a0,    a0,    a1
.endr
.irp i, vr22, vr23, vr24, vr25
    vstelm.w         \i,    a0,    0,    1
    add.d            a0,    a0,    a1
.endr
.irp i, vr22, vr23, vr24, vr25
    vstelm.w         \i,    a0,    0,    2
    add.d            a0,    a0,    a1
.endr
.irp i, vr22, vr23, vr24, vr25
    vstelm.w         \i,    a0,    0,    3
    add.d            a0,    a0,    a1
.endr
    addi.d           a0,    a0,    2
endfuncl

functionl lpf_v_16_16_lsx
    slli.d           t3,    a1,    3
    sub.d            s0,    a0,    t3
    add.d            s0,    s0,    a1
    vld              vr17,  s0,    0        // p6
    vldx             vr18,  s0,    a1       // p5
    alsl.d           s0,    a1,    s0,   1
    vld              vr19,  s0,    0        // p4
    vldx             vr20,  s0,    a1       // p3
    alsl.d           s0,    a1,    s0,   1
    vld              vr21,  s0,    0        // p2
    vldx             vr22,  s0,    a1       // p1
    alsl.d           s0,    a1,    s0,   1
    vld              vr23,  s0,    0        // p0
    vldx             vr24,  s0,    a1       // q0
    alsl.d           s0,    a1,    s0,   1
    vld              vr25,  s0,    0        // q1
    vldx             vr26,  s0,    a1       // q2
    alsl.d           s0,    a1,    s0,   1
    vld              vr27,  s0,    0        // q3
    vldx             vr28,  s0,    a1       // q4
    alsl.d           s0,    a1,    s0,   1
    vld              vr29,  s0,    0        // q5
    vldx             vr30,  s0,    a1       // q6

    LPF_16_WD16

    sub.d            s0,    a0,    t3
    alsl.d           s0,    a1,    s0,   1
    vst              vr0,   s0,    0        // p5
    vstx             vr1,   s0,    a1       // p4
    alsl.d           s0,    a1,    s0,   1
    vst              vr2,   s0,    0        // p3
    vstx             vr3,   s0,    a1       // p2
    alsl.d           s0,    a1,    s0,   1
    vst              vr4,   s0,    0        // p1
    vstx             vr5,   s0,    a1       // p0
    alsl.d           s0,    a1,    s0,   1
    vst              vr6,   s0,    0        // q0
    vstx             vr7,   s0,    a1       // q1
    alsl.d           s0,    a1,    s0,   1
    vst              vr8,   s0,    0        // q2
    vstx             vr9,   s0,    a1       // q3
    alsl.d           s0,    a1,    s0,   1
    vst              vr10,  s0,    0        // q4
    vstx             vr11,  s0,    a1       // q5
    jirl             zero,  ra,    0x00
7:
    slli.d           t3,    a1,    1
    add.d            t3,    t3,    a1
    sub.d            s0,    a0,    t3
    vst              vr21,  s0,    0        // p2
    vstx             vr22,  s0,    a1       // p1
    alsl.d           s0,    a1,    s0,   1
    vst              vr23,  s0,    0        // p0
    vstx             vr24,  s0,    a1       // q0
    alsl.d           s0,    a1,    s0,   1
    vst              vr25,  s0,    0        // q1
    vstx             vr26,  s0,    a1       // q2
    jirl             zero,  ra,    0x00
8:
    slli.d           t3,    a1,    1
    sub.d            s0,    a0,    t3
    vst              vr22,  s0,    0        // p1
    vstx             vr23,  s0,    a1       // p0
    alsl.d           s0,    a1,    s0,   1
    vst              vr24,  s0,    0        // q0
    vstx             vr25,  s0,    a1       // q1
endfuncl

functionl lpf_h_16_16_lsx
    addi.d           t3,    a0,   -8
    vld              vr16,  t3,    0
    vldx             vr17,  t3,    a1
    alsl.d           t3,    a1,    t3,   1
    vld              vr18,  t3,    0
    vldx             vr19,  t3,    a1
    alsl.d           t3,    a1,    t3,   1
    vld              vr20,  t3,    0
    vldx             vr21,  t3,    a1
    alsl.d           t3,    a1,    t3,   1
    vld              vr22,  t3,    0
    vldx             vr23,  t3,    a1
    alsl.d           t3,    a1,    t3,   1
    vld              vr24,  t3,    0
    vldx             vr25,  t3,    a1
    alsl.d           t3,    a1,    t3,   1
    vld              vr26,  t3,    0
    vldx             vr27,  t3,    a1
    alsl.d           t3,    a1,    t3,   1
    vld              vr28,  t3,    0
    vldx             vr29,  t3,    a1
    alsl.d           t3,    a1,    t3,   1
    vld              vr30,  t3,    0
    vldx             vr31,  t3,    a1
    alsl.d           t3,    a1,    t3,   1
.macro SWAPD in0, in1
    vaddi.bu         vr0,   \in0,  0
    vilvl.d          \in0,  \in1,  \in0
    vilvh.d          \in1,  \in1,  vr0
.endm
    SWAPD            vr16,  vr24
    SWAPD            vr17,  vr25
    SWAPD            vr18,  vr26
    SWAPD            vr19,  vr27
    SWAPD            vr20,  vr28
    SWAPD            vr21,  vr29
    SWAPD            vr22,  vr30
    SWAPD            vr23,  vr31
    addi.d           a0,    t3,    8

    TRANSPOSE_8x16B vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23, vr0, vr1
    TRANSPOSE_8x16B vr24, vr25, vr26, vr27, vr28, vr29, vr30, vr31, vr0, vr1

    LPF_16_WD16

    slli.d           t3,    a1,    4
    sub.d            a0,    a0,    t3

    TRANSPOSE_8x16B vr16, vr17, vr0,  vr1,  vr2,  vr3,  vr4,  vr5,  vr18, vr19
    TRANSPOSE_8x16B vr6,  vr7,  vr8,  vr9,  vr10, vr11, vr30, vr31, vr18, vr19

    addi.d           t3,    a0,   -8
.irp i, vr16, vr17, vr0, vr1, vr2, vr3, vr4, vr5
    vstelm.d         \i,    t3,    0,    0
    add.d            t3,    t3,    a1
.endr
.irp i, vr16, vr17, vr0, vr1, vr2, vr3, vr4, vr5
    vstelm.d         \i,    t3,    0,    1
    add.d            t3,    t3,    a1
.endr
.irp i, vr6, vr7, vr8, vr9, vr10, vr11, vr30, vr31
    vstelm.d         \i,    a0,    0,    0
    add.d            a0,    a0,    a1
.endr
.irp i, vr6, vr7, vr8, vr9, vr10, vr11, vr30, vr31
    vstelm.d         \i,    a0,    0,    1
    add.d            a0,    a0,    a1
.endr
    jirl             zero,  ra,    0x00

7:
    slli.d           t3,    a1,    4
    sub.d            a0,    a0,    t3

    TRANSPOSE_8x16B vr20, vr21, vr22, vr23, vr24, vr25, vr26, vr27, vr28, vr29

    addi.d           a0,    a0,   -4
.irp i, vr20, vr21, vr22, vr23, vr24, vr25, vr26, vr27
    vstelm.d         \i,    a0,    0,    0
    add.d            a0,    a0,    a1
.endr
.irp i, vr20, vr21, vr22, vr23, vr24, vr25, vr26, vr27
    vstelm.d         \i,    a0,    0,    1
    add.d            a0,    a0,    a1
.endr
    addi.d           a0,    a0,    4
    jirl             zero,  ra,    0x00
8:

    slli.d           t3,    a1,    4
    sub.d            a0,    a0,    t3

    TRANSPOSE_4x16B vr22, vr23, vr24, vr25, vr26, vr27, vr28, vr29

    addi.d           a0,    a0,   -2
.irp i, 0, 1, 2, 3
    vstelm.w         vr22,  a0,    0,    \i
    add.d            a0,    a0,    a1
    vstelm.w         vr23,  a0,    0,    \i
    add.d            a0,    a0,    a1
    vstelm.w         vr24,  a0,    0,    \i
    add.d            a0,    a0,    a1
    vstelm.w         vr25,  a0,    0,    \i
    add.d            a0,    a0,    a1
.endr
    addi.d           a0,    a0,    2
endfuncl

.macro PUSH_REG
    addi.d           sp,    sp,   -64-8
    fst.d            f24,   sp,    0
    fst.d            f25,   sp,    8
    fst.d            f26,   sp,    16
    fst.d            f27,   sp,    24
    fst.d            f28,   sp,    32
    fst.d            f29,   sp,    40
    fst.d            f30,   sp,    48
    fst.d            f31,   sp,    56
    st.d             s0,    sp,    64
.endm
.macro POP_REG
    fld.d            f24,   sp,    0
    fld.d            f25,   sp,    8
    fld.d            f26,   sp,    16
    fld.d            f27,   sp,    24
    fld.d            f28,   sp,    32
    fld.d            f29,   sp,    40
    fld.d            f30,   sp,    48
    fld.d            f31,   sp,    56
    ld.d             s0,    sp,    64
    addi.d           sp,    sp,    64+8
.endm

const mask_1248
.word 1, 2, 4, 8
endconst

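// Rough C-level sketch of the per-column loop that LPF_FUNC implements, in the
// spirit of the generic dav1d loop filter (names below are illustrative only;
// the asm processes 16 pixels, i.e. four 4-pixel blocks, per iteration):
//
//     for (unsigned x = 1; vmask[0] | vmask[1] /* | vmask[2] for y */; x <<= 1) {
//         int L = l[0][0] ? l[0][0] : l[-b4_stride][0]; // level for this edge
//         if (!L) continue;
//         int H = L >> 4;                               // hev threshold
//         int I = imax(imin(L >> sharp[0], sharp[1]), 1); // limit
//         int E = 2 * (L + 2) + I;                      // edge threshold
//         if (vmask[2] & x)      filter_wd16(E, I, H);  // y planes only
//         else if (vmask[1] & x) filter_wd8_or_6(E, I, H); // wd8 for y, wd6 for uv
//         else if (vmask[0] & x) filter_wd4(E, I, H);
//     }
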
.macro LPF_FUNC DIR, TYPE
function lpf_\DIR\()_sb_\TYPE\()_8bpc_lsx
    PUSH_REG
    move             t8,    ra
    vld              vr0,   a2,    0 //vmask
    vpickve2gr.wu    t0,    vr0,   0
    vpickve2gr.wu    t1,    vr0,   1
.ifc \TYPE, y
    vpickve2gr.wu    t2,    vr0,   2
.endif
    addi.d           a5,    a5,    128 // Move to sharp part of lut
.ifc \TYPE, y
    or               t1,    t1,    t2 // vmask[1] |= vmask[2]
.endif
    slli.d           a4,    a4,    2
.ifc \DIR, v
    sub.d            a4,    a3,    a4
.else
    addi.d           a3,    a3,   -4
.endif
    or               t0,    t0,    t1 // vmask[0] |= vmask[1]
1:
    andi             t3,    t0,    0x0f
.ifc \DIR, v
    vld              vr0,   a4,    0 // l[-b4_stride][]
    addi.d           a4,    a4,    16
    vld              vr1,   a3,    0 // l[0][]
    addi.d           a3,    a3,    16
.else
    fld.d            f0,    a3,    0
    fldx.d           f1,    a3,    a4
    alsl.d           a3,    a4,    a3,   1
    fld.d            f2,    a3,    0
    fldx.d           f3,    a3,    a4
    alsl.d           a3,    a4,    a3,   1
    vilvl.w          vr1,   vr1,   vr0
    vilvl.w          vr2,   vr3,   vr2
    vilvl.d          vr0,   vr2,   vr1
    vilvh.d          vr1,   vr2,   vr1
.endif
    beqz             t3,    7f
    //l[0][] ? l[0][] : l[-b4_stride][]
    vseqi.b          vr2,   vr1,    0
    vbitsel.v        vr1,   vr1,   vr0,   vr2
    li.w             t3,    0xff
    vreplgr2vr.w     vr3,   t3
    vand.v           vr1,   vr1,   vr3
    vshuf4i.b        vr1,   vr1,   0x00 // L --       1  0  2   0
    vseqi.w          vr2,   vr1,   0    //            0 -1  0  -1
    vseqi.w          vr2,   vr2,   0    // L != 0 -- -1  0 -1   0
    vhaddw.qu.du     vr3,   vr2,   vr2
    vpickve2gr.du    t4,    vr3,   0
    beqz             t4,    7f          // if (!L) continue
    la.local         t3,    mask_1248   // bits x
    vld              vr16,  t3,    0
    vreplgr2vr.w     vr13,  t0          // vmask[0]
    vreplgr2vr.w     vr14,  t1          // vmask[1]
    vand.v           vr13,  vr13,  vr16
    vseqi.w          vr13,  vr13,  0
    vseqi.w          vr13,  vr13,  0    // if (vmask[0] & x)
    vand.v           vr13,  vr13,  vr2  // vmask[0] &= L != 0
    vand.v           vr14,  vr14,  vr16
    vseqi.w          vr14,  vr14,  0
    vseqi.w          vr14,  vr14,  0    // if (vmask[1] & x)
.ifc \TYPE, y
    vreplgr2vr.w     vr15,  t2          // vmask[2]
    vand.v           vr15,  vr15,  vr16
    vseqi.w          vr15,  vr15,  0
    vseqi.w          vr15,  vr15,  0    // if (vmask[2] & x)
.endif
    vldrepl.b        vr5,   a5,    0    // sharp[0]
    addi.d           t5,    a5,    8
    vldrepl.b        vr6,   t5,    0    // sharp[1]
    vsrl.b           vr3,   vr1,   vr5  // L >> sharp[0]
    vsrli.b          vr12,  vr1,   4    // H
    vmin.bu          vr3,   vr3,   vr6  // imin(L >> sharp[0], sharp[1])
    vaddi.bu         vr0,   vr1,   2    // L + 2
    vmaxi.bu         vr11,  vr3,   1    // imax(imin(), 1) = limit = I
    vslli.b          vr0,   vr0,   1    // 2*(L + 2)
    vadd.b           vr10,  vr0,   vr11 // 2*(L + 2) + limit = E
.ifc \TYPE, y
    andi             t3,    t2,    0x0f
    beqz             t3,    2f
    //wd16
    bl               lpf_\DIR\()_16_16_lsx
    b                8f
2:
.endif
    andi             t3,    t1,    0x0f
    beqz             t3,    3f
.ifc \TYPE, y
    // wd8
    bl               lpf_\DIR\()_8_16_lsx
.else
    // wd6
    bl               lpf_\DIR\()_6_16_lsx
.endif
    b                8f
3:
    // wd4
    bl               lpf_\DIR\()_4_16_lsx
.ifc \DIR, h
    b                8f
7:
    // For dir h, the functions above increment a0.
    // If the whole function is skipped, increment it here instead.
    alsl.d           a0,    a1,    a0,    4
.else
7:
.endif
8:
    srli.d           t0,    t0,    4
    srli.d           t1,    t1,    4
.ifc \TYPE, y
    srli.d           t2,    t2,    4
.endif
.ifc \DIR, v
    addi.d           a0,    a0,    16
.else
    // For dir h, a0 is returned incremented
.endif
    bnez             t0,    1b
    move             ra,    t8
    POP_REG
endfunc
.endm

LPF_FUNC h, y
LPF_FUNC v, y
LPF_FUNC h, uv
LPF_FUNC v, uv