/*
 * Copyright © 2023, VideoLAN and dav1d authors
 * Copyright © 2023, Loongson Technology Corporation Limited
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/loongarch/loongson_asm.S"

/*
static void warp_affine_8x8_c(pixel *dst, const ptrdiff_t dst_stride,
                              const pixel *src, const ptrdiff_t src_stride,
                              const int16_t *const abcd, int mx, int my
                              HIGHBD_DECL_SUFFIX)
*/
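/*
 * For reference, a minimal C sketch of the two-pass algorithm implemented
 * below, mirroring dav1d's scalar version for 8bpc (intermediate_bits == 4);
 * iclip_u8() and the local variables are illustrative only. The pointer
 * adjustment src -= 3 * src_stride + 3 matches the one done in the prologue.
 *
 *     int16_t mid[15 * 8];
 *     src -= 3 * src_stride + 3;
 *     for (int y = 0; y < 15; y++, mx += abcd[1], src += src_stride)
 *         // horizontal 8-tap pass, rounded by 7 - intermediate_bits = 3
 *         for (int x = 0, tmx = mx; x < 8; x++, tmx += abcd[0]) {
 *             const int8_t *const f = dav1d_mc_warp_filter[64 + ((tmx + 512) >> 10)];
 *             int sum = 0;
 *             for (int k = 0; k < 8; k++) sum += f[k] * src[x + k];
 *             mid[y * 8 + x] = (sum + 4) >> 3;
 *         }
 *     for (int y = 0; y < 8; y++, my += abcd[3], dst += dst_stride)
 *         // vertical 8-tap pass: rounded by 7 + intermediate_bits = 11 for
 *         // the "put" variant (u8 output); the "t"/prep variant uses shift 7
 *         // and stores int16_t
 *         for (int x = 0, tmy = my; x < 8; x++, tmy += abcd[2]) {
 *             const int8_t *const f = dav1d_mc_warp_filter[64 + ((tmy + 512) >> 10)];
 *             int sum = 0;
 *             for (int k = 0; k < 8; k++) sum += f[k] * mid[(y + k) * 8 + x];
 *             dst[x] = iclip_u8((sum + 1024) >> 11);
 *         }
 */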
.macro vld_filter_row dst, src, inc
    addi.w          t3,       \src,     512
    srai.w          t3,       t3,       10
    add.w           \src,     \src,     \inc
    addi.w          t3,       t3,       64
    slli.w          t3,       t3,       3
    fldx.d          \dst,     t4,       t3
.endm
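/*
 * The address arithmetic above is, in C,
 *     dav1d_mc_warp_filter[64 + ((\src + 512) >> 10)]
 * where each filter row holds 8 int8_t coefficients, hence the final
 * "slli.w t3, t3, 3" turning the row index into a byte offset from t4
 * (the table base loaded in the function prologue).
 */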

.macro warp_filter_horz_lsx
    addi.w          t5,       a5,       0
    vld             vr10,     a2,       0
    add.d           a2,       a2,       a3

    vld_filter_row f0, t5, t0
    vld_filter_row f1, t5, t0
    vld_filter_row f2, t5, t0
    vld_filter_row f3, t5, t0
    vld_filter_row f4, t5, t0
    vld_filter_row f5, t5, t0
    vld_filter_row f6, t5, t0
    vld_filter_row f7, t5, t0

    // u8 -> i8: flipping the sign bit subtracts 128, so signed 8-bit
    // multiplies can be used; the bias is compensated later via vr21.
    vxor.v          vr10,     vr10,     vr20

    vbsrl.v         vr8,      vr10,     1
    vbsrl.v         vr9,      vr10,     2
    vilvl.d         vr8,      vr8,      vr10
    vilvl.d         vr0,      vr1,      vr0
    vmulwev.h.b     vr11,     vr8,      vr0
    vmulwod.h.b     vr12,     vr8,      vr0
    vbsrl.v         vr8,      vr10,     3
    vbsrl.v         vr19,     vr10,     4
    vilvl.d         vr8,      vr8,      vr9
    vilvl.d         vr2,      vr3,      vr2
    vmulwev.h.b     vr13,     vr8,      vr2
    vmulwod.h.b     vr14,     vr8,      vr2
    vbsrl.v         vr8,      vr10,     5
    vbsrl.v         vr9,      vr10,     6
    vilvl.d         vr8,      vr8,      vr19
    vilvl.d         vr4,      vr5,      vr4
    vmulwev.h.b     vr15,     vr8,      vr4
    vmulwod.h.b     vr16,     vr8,      vr4
    vbsrl.v         vr8,      vr10,     7
    vilvl.d         vr8,      vr8,      vr9
    vilvl.d         vr6,      vr7,      vr6
    vmulwev.h.b     vr17,     vr8,      vr6
    vmulwod.h.b     vr18,     vr8,      vr6

    vadd.h          vr11,     vr11,     vr12
    vadd.h          vr13,     vr13,     vr14
    vadd.h          vr15,     vr15,     vr16
    vadd.h          vr17,     vr17,     vr18
    vpickev.h       vr12,     vr13,     vr11
    vpickod.h       vr14,     vr13,     vr11
    vpickev.h       vr16,     vr17,     vr15
    vpickod.h       vr18,     vr17,     vr15
    vadd.h          vr11,     vr12,     vr14
    vadd.h          vr15,     vr16,     vr18
    vpickev.h       vr12,     vr15,     vr11
    vpickod.h       vr14,     vr15,     vr11
    vadd.h          vr11,     vr12,     vr14

    add.d           a5,       a5,       t1
.endm

.macro transpose_8x8b_extend_lsx in0, in1, in2, in3, in4, in5, in6, in7
    vilvl.b         \in0,     \in1,     \in0
    vilvl.b         \in2,     \in3,     \in2
    vilvl.b         \in4,     \in5,     \in4
    vilvl.b         \in6,     \in7,     \in6

    vpackev.h       \in1,     \in2,     \in0
    vpackod.h       \in3,     \in2,     \in0
    vpackev.h       \in5,     \in6,     \in4
    vpackod.h       \in7,     \in6,     \in4

    vpackev.w       \in0,     \in5,     \in1
    vpackod.w       \in2,     \in5,     \in1
    vpackev.w       \in1,     \in7,     \in3
    vpackod.w       \in3,     \in7,     \in3

    vexth.h.b       \in4,     \in0
    vsllwil.h.b     \in0,     \in0,     0
    vexth.h.b       \in5,     \in1
    vsllwil.h.b     \in1,     \in1,     0
    vexth.h.b       \in6,     \in2
    vsllwil.h.b     \in2,     \in2,     0
    vexth.h.b       \in7,     \in3
    vsllwil.h.b     \in3,     \in3,     0
.endm
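/*
 * The macro above is an 8x8 byte transpose whose result is then
 * sign-extended to 16 bits: on return, \in0..\in7 hold the eight
 * transposed rows as int16_t lanes (vsllwil.h.b widens the low half of
 * each register, vexth.h.b the high half).
 */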

.macro warp t, shift
function warp_affine_8x8\t\()_8bpc_lsx
    addi.d          sp,       sp,      -64
    fst.d           f24,      sp,      0
    fst.d           f25,      sp,      8
    fst.d           f26,      sp,      16
    fst.d           f27,      sp,      24
    fst.d           f28,      sp,      32
    fst.d           f29,      sp,      40
    fst.d           f30,      sp,      48
    fst.d           f31,      sp,      56

    ld.h            t0,       a4,      0   // abcd[0]
    ld.h            t1,       a4,      2   // abcd[1]
    ld.h            t2,       a4,      4   // abcd[2]
    ld.h            a4,       a4,      6   // abcd[3]

    li.d            t7,       8
    alsl.w          t3,       a3,      a3,     1
    sub.d           a2,       a2,      t3      // start 3 rows ...
    addi.d          a2,       a2,      -3      // ... and 3 columns early
    la.local        t4,       dav1d_mc_warp_filter

.ifnb \t
    slli.d          a1,       a1,      1
.endif

    li.w            t3,       128
    vreplgr2vr.b    vr20,     t3      // 0x80 in every byte: u8 -> i8 bias
.ifb \t
    vreplgr2vr.h    vr21,     t3      // put: compensates the pixel bias after >> 11
.else
    li.w            t3,       2048
    vreplgr2vr.h    vr21,     t3      // prep: compensates the pixel bias after >> 7
.endif
    // prime the first seven rows of the eight-row vertical window (vr24..vr30)
    warp_filter_horz_lsx
    vsrari.h        vr24,     vr11,    3
    warp_filter_horz_lsx
    vsrari.h        vr25,     vr11,    3
    warp_filter_horz_lsx
    vsrari.h        vr26,     vr11,    3
    warp_filter_horz_lsx
    vsrari.h        vr27,     vr11,    3
    warp_filter_horz_lsx
    vsrari.h        vr28,     vr11,    3
    warp_filter_horz_lsx
    vsrari.h        vr29,     vr11,    3
    warp_filter_horz_lsx
    vsrari.h        vr30,     vr11,    3

1:
    addi.d          t6,       a6,      0
    warp_filter_horz_lsx
    vsrari.h        vr31,     vr11,    3

    vld_filter_row f0, t6, t2
    vld_filter_row f1, t6, t2
    vld_filter_row f2, t6, t2
    vld_filter_row f3, t6, t2
    vld_filter_row f4, t6, t2
    vld_filter_row f5, t6, t2
    vld_filter_row f6, t6, t2
    vld_filter_row f7, t6, t2

    transpose_8x8b_extend_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7

    vmulwev.w.h     vr16,     vr24,    vr0
    vmulwod.w.h     vr17,     vr24,    vr0
    vmaddwev.w.h    vr16,     vr25,    vr1
    vmaddwod.w.h    vr17,     vr25,    vr1
    vmaddwev.w.h    vr16,     vr26,    vr2
    vmaddwod.w.h    vr17,     vr26,    vr2
    vmaddwev.w.h    vr16,     vr27,    vr3
    vmaddwod.w.h    vr17,     vr27,    vr3
    vmaddwev.w.h    vr16,     vr28,    vr4
    vmaddwod.w.h    vr17,     vr28,    vr4
    vmaddwev.w.h    vr16,     vr29,    vr5
    vmaddwod.w.h    vr17,     vr29,    vr5
    vmaddwev.w.h    vr16,     vr30,    vr6
    vmaddwod.w.h    vr17,     vr30,    vr6
    vmaddwev.w.h    vr16,     vr31,    vr7
    vmaddwod.w.h    vr17,     vr31,    vr7

    vssrarni.h.w    vr16,     vr16,    \shift
    vssrarni.h.w    vr17,     vr17,    \shift
    vilvl.h         vr16,     vr17,    vr16
    vadd.h          vr16,     vr16,    vr21

    // slide the vertical window down one row: vr24..vr30 <- vr25..vr31
    vor.v           vr24,     vr25,    vr25
    vor.v           vr25,     vr26,    vr26
    vor.v           vr26,     vr27,    vr27
    vor.v           vr27,     vr28,    vr28
    vor.v           vr28,     vr29,    vr29
    vor.v           vr29,     vr30,    vr30
    vor.v           vr30,     vr31,    vr31

.ifb \t
    vssrarni.bu.h   vr16,     vr16,    0
.endif

    addi.d          t7,       t7,      -1
.ifnb \t
    vst             vr16,     a0,      0
.else
    vstelm.d        vr16,     a0,      0,   0
.endif
    add.d           a0,       a1,      a0

    add.d           a6,       a6,      a4
    blt             zero,     t7,      1b

    fld.d           f24,      sp,      0
    fld.d           f25,      sp,      8
    fld.d           f26,      sp,      16
    fld.d           f27,      sp,      24
    fld.d           f28,      sp,      32
    fld.d           f29,      sp,      40
    fld.d           f30,      sp,      48
    fld.d           f31,      sp,      56
    addi.d          sp,       sp,      64
endfunc
.endm

// shift 11: "put" variant (8-bit pixel output); shift 7: "t"/prep variant
// (int16_t intermediate output)
warp  , 11
warp t, 7

.macro FILTER_WARP_RND_P_LASX in0, in1, in2, out0, out1, out2, out3
    xvshuf.b        xr2,    \in0,     \in0,     \in2

    addi.w          t4,     \in1,     512
    srai.w          t4,     t4,       10
    addi.w          t4,     t4,       64
    slli.w          t4,     t4,       3
    vldx            vr3,    t5,       t4
    add.w           t3,     t3,       t0   // tmx += abcd[0]

    addi.w          t4,     t3,       512
    srai.w          t4,     t4,       10
    addi.w          t4,     t4,       64
    slli.w          t4,     t4,       3
    vldx            vr4,    t5,       t4
    add.w           t3,     t3,       t0   // tmx += abcd[0]

    addi.w          t4,     t3,       512
    srai.w          t4,     t4,       10
    addi.w          t4,     t4,       64
    slli.w          t4,     t4,       3
    vldx            vr5,    t5,       t4
    add.w           t3,     t3,       t0   // tmx += abcd[0]

    addi.w          t4,     t3,       512
    srai.w          t4,     t4,       10
    addi.w          t4,     t4,       64
    slli.w          t4,     t4,       3
    vldx            vr6,    t5,       t4
    add.w           t3,     t3,       t0   // tmx += abcd[0]

    xvinsve0.d      xr3,    xr5,      1
    xvinsve0.d      xr3,    xr4,      2
    xvinsve0.d      xr3,    xr6,      3

    xvmulwev.h.bu.b xr4,    xr2,      xr3
    xvmulwod.h.bu.b xr5,    xr2,      xr3
    xvilvl.d        xr2,    xr5,      xr4
    xvilvh.d        xr3,    xr5,      xr4
    xvhaddw.w.h     xr2,    xr2,      xr2
    xvhaddw.w.h     xr3,    xr3,      xr3
    xvhaddw.d.w     xr2,    xr2,      xr2
    xvhaddw.d.w     xr3,    xr3,      xr3
    xvhaddw.q.d     xr2,    xr2,      xr2
    xvhaddw.q.d     xr3,    xr3,      xr3

    xvextrins.w     \out0,  xr2,      \out1
    xvextrins.w     \out2,  xr3,      \out3
.endm
303
304.macro FILTER_WARP_CLIP_LASX in0, in1, in2, out0, out1
305    add.w           \in0,     \in0,    \in1
306    addi.w          t6,       \in0,    512
307    srai.w          t6,       t6,      10
308    addi.w          t6,       t6,      64
309    slli.w          t6,       t6,      3
310    fldx.d          f1,       t5,      t6
311
312    add.w           t2,       t2,      t7
313    addi.w          t6,       t2,      512
314    srai.w          t6,       t6,      10
315    addi.w          t6,       t6,      64
316    slli.w          t6,       t6,      3
317    fldx.d          f2,       t5,      t6
318
319    vilvl.d         vr0,      vr2,     vr1
320    vext2xv.h.b     xr0,      xr0
321    xvmulwev.w.h    xr3,      \in2,    xr0
322    xvmaddwod.w.h   xr3,      \in2,    xr0
323    xvhaddw.d.w     xr3,      xr3,     xr3
324    xvhaddw.q.d     xr3,      xr3,     xr3
325    xvextrins.w     \out0,    xr3,     \out1
326.endm
327
// sliding 8-tap source windows: offsets 0 and 2 in the low 128-bit lane,
// offsets 1 and 3 in the high lane (the +4 variant covers offsets 4..7)
const shuf0
.byte  0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9
.byte  1, 2, 3, 4, 5, 6, 7, 8, 3, 4, 5, 6, 7, 8, 9, 10
endconst

// byte-shuffle indices that slide the packed row history left by one
// int16_t lane; the second half supplies the indices of the incoming row
const warp_sh
.rept 2
.byte 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17
.endr
.rept 2
.byte 18, 19, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
.endr
endconst

.macro warp_lasx t, shift
function warp_affine_8x8\t\()_8bpc_lasx
    addi.d          sp,       sp,      -16
    ld.h            t0,       a4,      0   // abcd[0]
    ld.h            t1,       a4,      2   // abcd[1]
    fst.d           f24,      sp,      0
    fst.d           f25,      sp,      8

    alsl.w          t2,       a3,      a3,     1
    addi.w          t3,       a5,      0
    la.local        t4,       warp_sh
    la.local        t5,       dav1d_mc_warp_filter
    sub.d           a2,       a2,      t2
    addi.d          a2,       a2,      -3
    vld             vr0,      a2,      0
    xvld            xr24,     t4,      0
    xvld            xr25,     t4,      32
    la.local        t2,       shuf0
    xvld            xr1,      t2,      0
    xvpermi.q       xr0,      xr0,     0x00
    xvaddi.bu       xr9,      xr1,     4
    FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x00, xr8, 0x00
    FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x00, xr11, 0x00

    add.w           a5,       a5,      t1
    or              t3,       a5,      a5
    add.d           a2,       a2,      a3
    vld             vr0,      a2,      0
    xvpermi.q       xr0,      xr0,     0x00
    FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x10, xr8, 0x10
    FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x10, xr11, 0x10

    add.w           a5,       a5,      t1
    or              t3,       a5,      a5
    add.d           a2,       a2,      a3
    vld             vr0,      a2,      0
    xvpermi.q       xr0,      xr0,     0x00
    FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x20, xr8, 0x20
    FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x20, xr11, 0x20

    add.w           a5,       a5,      t1
    or              t3,       a5,      a5
    add.d           a2,       a2,      a3
    vld             vr0,      a2,      0
    xvpermi.q       xr0,      xr0,     0x00
    FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x30, xr8, 0x30
    FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x30, xr11, 0x30

    add.w           a5,       a5,      t1
    or              t3,       a5,      a5
    add.d           a2,       a2,      a3
    vld             vr0,      a2,      0
    xvpermi.q       xr0,      xr0,     0x00
    FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr12, 0x00, xr13, 0x00
    FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr14, 0x00, xr15, 0x00

    add.w           a5,       a5,      t1
    or              t3,       a5,      a5
    add.d           a2,       a2,      a3
    vld             vr0,      a2,      0
    xvpermi.q       xr0,      xr0,     0x00
    FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr12, 0x10, xr13, 0x10
    FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr14, 0x10, xr15, 0x10

    add.w           a5,       a5,      t1
    or              t3,       a5,      a5
    add.d           a2,       a2,      a3
    vld             vr0,      a2,      0
    xvpermi.q       xr0,      xr0,     0x00
    FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr12, 0x20, xr13, 0x20
    FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr14, 0x20, xr15, 0x20

    add.w           a5,       a5,      t1
    or              t3,       a5,      a5
    add.d           a2,       a2,      a3
    vld             vr0,      a2,      0
    xvpermi.q       xr0,      xr0,     0x00
    FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr12, 0x30, xr13, 0x30
    FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr14, 0x30, xr15, 0x30

    xvsrarni.h.w    xr12,     xr7,     3
    xvsrarni.h.w    xr13,     xr8,     3
    xvsrarni.h.w    xr14,     xr10,    3
    xvsrarni.h.w    xr15,     xr11,    3

    add.w           a5,       a5,      t1
    or              t3,       a5,      a5
    add.d           a2,       a2,      a3
    vld             vr0,      a2,      0
    xvpermi.q       xr0,      xr0,     0x00
    FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x00, xr8, 0x00
    FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x00, xr11, 0x00

    add.w           a5,       a5,      t1
    or              t3,       a5,      a5
    add.d           a2,       a2,      a3
    vld             vr0,      a2,      0
    xvpermi.q       xr0,      xr0,     0x00
    FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x10, xr8, 0x10
    FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x10, xr11, 0x10

    add.w           a5,       a5,      t1
    or              t3,       a5,      a5
    add.d           a2,       a2,      a3
    vld             vr0,      a2,      0
    xvpermi.q       xr0,      xr0,     0x00
    FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x20, xr8, 0x20
    FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x20, xr11, 0x20

    add.w           a5,       a5,      t1
    or              t3,       a5,      a5
    add.d           a2,       a2,      a3
    vld             vr0,      a2,      0
    xvpermi.q       xr0,      xr0,     0x00
    FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x30, xr8, 0x30
    FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x30, xr11, 0x30

    add.w           a5,       a5,      t1
    or              t3,       a5,      a5
    add.d           a2,       a2,      a3
    vld             vr0,      a2,      0
    xvpermi.q       xr0,      xr0,     0x00
    FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr16, 0x00, xr17, 0x00
    FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr18, 0x00, xr19, 0x00

    add.w           a5,       a5,      t1
    or              t3,       a5,      a5
    add.d           a2,       a2,      a3
    vld             vr0,      a2,      0
    xvpermi.q       xr0,      xr0,     0x00
    FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr16, 0x10, xr17, 0x10
    FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr18, 0x10, xr19, 0x10

    add.w           a5,       a5,      t1
    or              t3,       a5,      a5
    add.d           a2,       a2,      a3
    vld             vr0,      a2,      0
    xvpermi.q       xr0,      xr0,     0x00
    FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr16, 0x20, xr17, 0x20
    FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr18, 0x20, xr19, 0x20

    xvsrarni.h.w    xr16,     xr7,     3
    xvsrarni.h.w    xr17,     xr8,     3
    xvsrarni.h.w    xr18,     xr10,    3
    xvsrarni.h.w    xr19,     xr11,    3

    addi.w          t2,       a6,      0   // my
    ld.h            t7,       a4,      4   // abcd[2]
    ld.h            t8,       a4,      6   // abcd[3]

.ifnb \t
    slli.d          a1,       a1,      1
.endif

    // y = 0
    FILTER_WARP_CLIP_LASX  t2, zero, xr12,  xr20, 0x00
    FILTER_WARP_CLIP_LASX  t2, t7,   xr13,  xr20, 0x10
    FILTER_WARP_CLIP_LASX  t2, t7,   xr14,  xr20, 0x20
    FILTER_WARP_CLIP_LASX  t2, t7,   xr15,  xr20, 0x30

    xvshuf.b         xr12,     xr16,    xr12,   xr24
    xvshuf.b         xr13,     xr17,    xr13,   xr24
    xvshuf.b         xr14,     xr18,    xr14,   xr24
    xvshuf.b         xr15,     xr19,    xr15,   xr24
    xvextrins.h      xr24,     xr25,    0x70

    add.w           a6,       a6,      t8
    addi.w          t2,       a6,      0
    FILTER_WARP_CLIP_LASX  t2, zero, xr12,  xr21, 0x00
    FILTER_WARP_CLIP_LASX  t2, t7,   xr13,  xr21, 0x10
    FILTER_WARP_CLIP_LASX  t2, t7,   xr14,  xr21, 0x20
    FILTER_WARP_CLIP_LASX  t2, t7,   xr15,  xr21, 0x30

.ifnb \t
    // prep: store two rows of 8 int16_t
    xvssrarni.h.w   xr21,     xr20,     \shift
    xvpermi.q       xr22,     xr21,     0x01
    vilvl.h         vr23,     vr22,     vr21
    vilvh.h         vr21,     vr22,     vr21
    vst             vr23,     a0,       0
    vstx            vr21,     a0,       a1
.else
    // put: pack to u8 and store two rows of 8 pixels
    xvssrarni.hu.w  xr21,     xr20,     \shift
    xvssrlni.bu.h   xr22,     xr21,     0
    xvpermi.q       xr23,     xr22,     0x01
    vilvl.b         vr21,     vr23,     vr22
    fst.d           f21,      a0,       0
    add.d           a0,       a0,       a1
    vstelm.d        vr21,     a0,       0,     1
.endif

    xvaddi.bu        xr25,     xr25,    2
    xvshuf.b         xr12,     xr16,    xr12,   xr24
    xvshuf.b         xr13,     xr17,    xr13,   xr24
    xvshuf.b         xr14,     xr18,    xr14,   xr24
    xvshuf.b         xr15,     xr19,    xr15,   xr24
    xvextrins.h      xr24,     xr25,    0x70

    add.w           a6,       a6,      t8
    addi.w          t2,       a6,      0
    FILTER_WARP_CLIP_LASX  t2, zero, xr12,  xr20, 0x00
    FILTER_WARP_CLIP_LASX  t2, t7,   xr13,  xr20, 0x10
    FILTER_WARP_CLIP_LASX  t2, t7,   xr14,  xr20, 0x20
    FILTER_WARP_CLIP_LASX  t2, t7,   xr15,  xr20, 0x30

    xvaddi.bu        xr25,     xr25,    2
    xvshuf.b         xr12,     xr16,    xr12,   xr24
    xvshuf.b         xr13,     xr17,    xr13,   xr24
    xvshuf.b         xr14,     xr18,    xr14,   xr24
    xvshuf.b         xr15,     xr19,    xr15,   xr24
    xvextrins.h      xr24,     xr25,    0x70

    add.w           a6,       a6,      t8
    addi.w          t2,       a6,      0
    FILTER_WARP_CLIP_LASX  t2, zero, xr12,  xr21, 0x00
    FILTER_WARP_CLIP_LASX  t2, t7,   xr13,  xr21, 0x10
    FILTER_WARP_CLIP_LASX  t2, t7,   xr14,  xr21, 0x20
    FILTER_WARP_CLIP_LASX  t2, t7,   xr15,  xr21, 0x30

.ifnb \t
    xvssrarni.h.w   xr21,     xr20,     \shift
    alsl.d          a0,       a1,       a0,     1
    xvpermi.q       xr22,     xr21,     0x01
    vilvl.h         vr23,     vr22,     vr21
    vilvh.h         vr21,     vr22,     vr21
    vst             vr23,     a0,       0
    vstx            vr21,     a0,       a1
.else
    xvssrarni.hu.w  xr21,     xr20,     \shift
    xvssrlni.bu.h   xr22,     xr21,     0
    xvpermi.q       xr23,     xr22,     0x01
    vilvl.b         vr21,     vr23,     vr22
    add.d           a0,       a0,       a1
    fst.d           f21,      a0,       0
    add.d           a0,       a0,       a1
    vstelm.d        vr21,     a0,       0,     1
.endif

    xvaddi.bu        xr25,     xr25,    2
    xvshuf.b         xr12,     xr16,    xr12,   xr24
    xvshuf.b         xr13,     xr17,    xr13,   xr24
    xvshuf.b         xr14,     xr18,    xr14,   xr24
    xvshuf.b         xr15,     xr19,    xr15,   xr24
    xvextrins.h      xr24,     xr25,    0x70

    add.w           a6,       a6,      t8
    addi.w          t2,       a6,      0
    FILTER_WARP_CLIP_LASX  t2, zero, xr12,  xr20, 0x00
    FILTER_WARP_CLIP_LASX  t2, t7,   xr13,  xr20, 0x10
    FILTER_WARP_CLIP_LASX  t2, t7,   xr14,  xr20, 0x20
    FILTER_WARP_CLIP_LASX  t2, t7,   xr15,  xr20, 0x30

    xvaddi.bu        xr25,     xr25,    2
    xvshuf.b         xr12,     xr16,    xr12,   xr24
    xvshuf.b         xr13,     xr17,    xr13,   xr24
    xvshuf.b         xr14,     xr18,    xr14,   xr24
    xvshuf.b         xr15,     xr19,    xr15,   xr24
    xvextrins.h      xr24,     xr25,    0x70

    add.w           a6,       a6,      t8
    addi.w          t2,       a6,      0
    FILTER_WARP_CLIP_LASX  t2, zero, xr12,  xr21, 0x00
    FILTER_WARP_CLIP_LASX  t2, t7,   xr13,  xr21, 0x10
    FILTER_WARP_CLIP_LASX  t2, t7,   xr14,  xr21, 0x20
    FILTER_WARP_CLIP_LASX  t2, t7,   xr15,  xr21, 0x30

.ifnb \t
    xvssrarni.h.w   xr21,     xr20,     \shift
    alsl.d          a0,       a1,       a0,     1
    xvpermi.q       xr22,     xr21,     0x01
    vilvl.h         vr23,     vr22,     vr21
    vilvh.h         vr21,     vr22,     vr21
    vst             vr23,     a0,       0
    vstx            vr21,     a0,       a1
.else
    xvssrarni.hu.w  xr21,     xr20,     \shift
    xvssrlni.bu.h   xr22,     xr21,     0
    xvpermi.q       xr23,     xr22,     0x01
    vilvl.b         vr21,     vr23,     vr22
    add.d           a0,       a0,       a1
    fst.d           f21,      a0,       0
    add.d           a0,       a0,       a1
    vstelm.d        vr21,     a0,       0,     1
.endif

    xvaddi.bu        xr25,     xr25,    2
    xvshuf.b         xr12,     xr16,    xr12,   xr24
    xvshuf.b         xr13,     xr17,    xr13,   xr24
    xvshuf.b         xr14,     xr18,    xr14,   xr24
    xvshuf.b         xr15,     xr19,    xr15,   xr24
    xvextrins.h      xr24,     xr25,    0x70

    add.w           a6,       a6,      t8
    addi.w          t2,       a6,      0
    FILTER_WARP_CLIP_LASX  t2, zero, xr12,  xr20, 0x00
    FILTER_WARP_CLIP_LASX  t2, t7,   xr13,  xr20, 0x10
    FILTER_WARP_CLIP_LASX  t2, t7,   xr14,  xr20, 0x20
    FILTER_WARP_CLIP_LASX  t2, t7,   xr15,  xr20, 0x30

    xvshuf.b         xr12,     xr16,    xr12,   xr24
    xvshuf.b         xr13,     xr17,    xr13,   xr24
    xvshuf.b         xr14,     xr18,    xr14,   xr24
    xvshuf.b         xr15,     xr19,    xr15,   xr24

    add.w           a6,       a6,      t8
    addi.w          t2,       a6,      0
    FILTER_WARP_CLIP_LASX  t2, zero, xr12,  xr21, 0x00
    FILTER_WARP_CLIP_LASX  t2, t7,   xr13,  xr21, 0x10
    FILTER_WARP_CLIP_LASX  t2, t7,   xr14,  xr21, 0x20
    FILTER_WARP_CLIP_LASX  t2, t7,   xr15,  xr21, 0x30

.ifnb \t
    xvssrarni.h.w   xr21,     xr20,     \shift
    alsl.d          a0,       a1,       a0,     1
    xvpermi.q       xr22,     xr21,     0x01
    vilvl.h         vr23,     vr22,     vr21
    vilvh.h         vr21,     vr22,     vr21
    vst             vr23,     a0,       0
    vstx            vr21,     a0,       a1
.else
    xvssrarni.hu.w  xr21,     xr20,     \shift
    xvssrlni.bu.h   xr22,     xr21,     0
    xvpermi.q       xr23,     xr22,     0x01
    vilvl.b         vr21,     vr23,     vr22
    add.d           a0,       a0,       a1
    fst.d           f21,      a0,       0
    add.d           a0,       a0,       a1
    vstelm.d        vr21,     a0,       0,     1
.endif
    fld.d            f24,     sp,       0
    fld.d            f25,     sp,       8
    addi.d           sp,      sp,       16
endfunc
.endm

warp_lasx , 11
warp_lasx t, 7

/*
static void w_avg_c(pixel *dst, const ptrdiff_t dst_stride,
                    const int16_t *tmp1, const int16_t *tmp2,
                    const int w, int h,
                    const int weight HIGHBD_DECL_SUFFIX)
*/

#define bpc8_sh     5     // sh = intermediate_bits + 1
#define bpcw8_sh    8     // sh = intermediate_bits + 4

#define bpc_sh   bpc8_sh
#define bpcw_sh  bpcw8_sh
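
/*
 * Per-pixel reference for the functions below, in C (8bpc, so
 * intermediate_bits == 4 and the rounded shifts are bpc_sh == 5 and
 * bpcw_sh == 8; clip_u8() is illustrative):
 *
 *     // avg:   dst[x] = clip_u8((tmp1[x] + tmp2[x] + 16) >> 5)
 *     // w_avg: dst[x] = clip_u8((tmp1[x] * weight +
 *     //                          tmp2[x] * (16 - weight) + 128) >> 8)
 *
 * The vssrarni.bu.h / xvssrarni.* instructions perform the rounding shift
 * and the clip to u8 in a single step.
 */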

function avg_8bpc_lsx
    addi.d        t8,     a0,     0

    clz.w         t0,     a4         // clz(w) - 24 maps w = 128,64,32,16,8,4
    li.w          t1,     24         // to jump-table indices 0..5
    sub.w         t0,     t0,      t1
    la.local      t1,     .AVG_LSX_JRTABLE
    alsl.d        t0,     t0,      t1,    1
    ld.h          t2,     t0,      0  // table entries are offsets relative to .AVG_LSX_JRTABLE
    add.d         t1,     t1,      t2 // turn the offset into an absolute address
    jirl          $r0,    t1,      0

    .align   3
.AVG_LSX_JRTABLE:
    .hword .AVG_W128_LSX - .AVG_LSX_JRTABLE
    .hword .AVG_W64_LSX  - .AVG_LSX_JRTABLE
    .hword .AVG_W32_LSX  - .AVG_LSX_JRTABLE
    .hword .AVG_W16_LSX  - .AVG_LSX_JRTABLE
    .hword .AVG_W8_LSX   - .AVG_LSX_JRTABLE
    .hword .AVG_W4_LSX   - .AVG_LSX_JRTABLE
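
/*
 * In C terms, the dispatch above is roughly (GNU computed goto; w is a
 * power of two in [4, 128], so __builtin_clz(w) - 24 yields index 0..5,
 * and the self-relative .hword entries keep the table position-independent):
 *
 *     const int idx = __builtin_clz(w) - 24;
 *     goto *((char *)jrtable + jrtable[idx]);   // jrtable = .AVG_LSX_JRTABLE
 */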

.AVG_W4_LSX:
    vld           vr0,    a2,     0
    vld           vr1,    a3,     0
    vadd.h        vr2,    vr0,    vr1
    vssrarni.bu.h vr3,    vr2,    bpc_sh  // (tmp1 + tmp2 + 16) >> 5, clipped to u8
    vstelm.w      vr3,    a0,     0,    0
    add.d         a0,     a0,     a1
    vstelm.w      vr3,    a0,     0,    1
    addi.w        a5,     a5,     -2      // two 4-pixel rows per iteration
    addi.d        a2,     a2,     16
    addi.d        a3,     a3,     16
    add.d         a0,     a0,     a1
    blt           zero,   a5,     .AVG_W4_LSX
    b             .AVG_END_LSX

.AVG_W8_LSX:
    vld           vr0,    a2,     0
    vld           vr2,    a2,     16
    vld           vr1,    a3,     0
    vld           vr3,    a3,     16
    vadd.h        vr4,    vr0,    vr1
    vadd.h        vr5,    vr2,    vr3
    vssrarni.bu.h vr5,    vr4,    bpc_sh
    addi.w        a5,     a5,     -2
    addi.d        a2,     a2,     32
    vstelm.d      vr5,    a0,     0,    0
    add.d         a0,     a0,     a1
    vstelm.d      vr5,    a0,     0,    1
    addi.d        a3,     a3,     32
    add.d         a0,     a0,     a1
    blt           zero,   a5,     .AVG_W8_LSX
    b             .AVG_END_LSX

.AVG_W16_LSX:
    vld           vr0,    a2,     0
    vld           vr2,    a2,     16
    vld           vr1,    a3,     0
    vld           vr3,    a3,     16
    vadd.h        vr4,    vr0,    vr1
    vadd.h        vr5,    vr2,    vr3
    vssrarni.bu.h vr5,    vr4,    bpc_sh
    addi.w        a5,     a5,     -1
    addi.d        a2,     a2,     32
    vst           vr5,    a0,     0
    addi.d        a3,     a3,     32
    add.d         a0,     a0,     a1
    blt           zero,   a5,     .AVG_W16_LSX
    b             .AVG_END_LSX

.AVG_W32_LSX:
    vld           vr0,    a2,     0
    vld           vr2,    a2,     16
    vld           vr4,    a2,     32
    vld           vr6,    a2,     48
    vld           vr1,    a3,     0
    vld           vr3,    a3,     16
    vld           vr5,    a3,     32
    vld           vr7,    a3,     48
    vadd.h        vr0,    vr0,    vr1
    vadd.h        vr2,    vr2,    vr3
    vadd.h        vr4,    vr4,    vr5
    vadd.h        vr6,    vr6,    vr7
    vssrarni.bu.h vr2,    vr0,    bpc_sh
    vssrarni.bu.h vr6,    vr4,    bpc_sh
    addi.w        a5,     a5,     -1
    addi.d        a2,     a2,     64
    vst           vr2,    a0,     0
    vst           vr6,    a0,     16
    addi.d        a3,     a3,     64
    add.d         a0,     a0,     a1
    blt           zero,   a5,     .AVG_W32_LSX
    b             .AVG_END_LSX

.AVG_W64_LSX:
.rept 4
    vld           vr0,    a2,     0
    vld           vr2,    a2,     16
    vld           vr1,    a3,     0
    vld           vr3,    a3,     16
    vadd.h        vr0,    vr0,    vr1
    vadd.h        vr2,    vr2,    vr3
    vssrarni.bu.h vr2,    vr0,    bpc_sh
    addi.d        a2,     a2,     32
    addi.d        a3,     a3,     32
    vst           vr2,    a0,     0
    addi.d        a0,     a0,     16
.endr
    addi.w        a5,     a5,     -1
    add.d         t8,     t8,     a1
    add.d         a0,     t8,     zero
    blt           zero,   a5,     .AVG_W64_LSX
    b             .AVG_END_LSX

.AVG_W128_LSX:
.rept 8
    vld           vr0,    a2,     0
    vld           vr2,    a2,     16
    vld           vr1,    a3,     0
    vld           vr3,    a3,     16
    vadd.h        vr0,    vr0,    vr1
    vadd.h        vr2,    vr2,    vr3
    vssrarni.bu.h vr2,    vr0,    bpc_sh
    addi.d        a2,     a2,     32
    addi.d        a3,     a3,     32
    vst           vr2,    a0,     0
    addi.d        a0,     a0,     16
.endr
    addi.w        a5,     a5,     -1
    add.d         t8,     t8,     a1
    add.d         a0,     t8,     zero
    blt           zero,   a5,     .AVG_W128_LSX
.AVG_END_LSX:
endfunc

function avg_8bpc_lasx
    clz.w         t0,     a4
    li.w          t1,     24
    sub.w         t0,     t0,      t1
    la.local      t1,     .AVG_LASX_JRTABLE
    alsl.d        t0,     t0,      t1,    1
    ld.h          t2,     t0,      0
    add.d         t1,     t1,      t2
    jirl          $r0,    t1,      0

    .align   3
.AVG_LASX_JRTABLE:
    .hword .AVG_W128_LASX - .AVG_LASX_JRTABLE
    .hword .AVG_W64_LASX  - .AVG_LASX_JRTABLE
    .hword .AVG_W32_LASX  - .AVG_LASX_JRTABLE
    .hword .AVG_W16_LASX  - .AVG_LASX_JRTABLE
    .hword .AVG_W8_LASX   - .AVG_LASX_JRTABLE
    .hword .AVG_W4_LASX   - .AVG_LASX_JRTABLE

.AVG_W4_LASX:
    vld            vr0,    a2,     0
    vld            vr1,    a3,     0
    vadd.h         vr0,    vr0,    vr1
    vssrarni.bu.h  vr1,    vr0,    bpc_sh
    vstelm.w       vr1,    a0,     0,    0
    add.d          a0,     a0,     a1
    vstelm.w       vr1,    a0,     0,    1
    addi.w         a5,     a5,     -2
    addi.d         a2,     a2,     16
    addi.d         a3,     a3,     16
    add.d          a0,     a0,     a1
    blt            zero,   a5,     .AVG_W4_LASX
    b              .AVG_END_LASX
.AVG_W8_LASX:
    xvld           xr0,    a2,     0
    xvld           xr1,    a3,     0
    xvadd.h        xr2,    xr0,    xr1
    xvssrarni.bu.h xr1,    xr2,    bpc_sh
    xvstelm.d      xr1,    a0,     0,    0
    add.d          a0,     a0,     a1
    xvstelm.d      xr1,    a0,     0,    2
    addi.w         a5,     a5,     -2
    addi.d         a2,     a2,     32
    addi.d         a3,     a3,     32
    add.d          a0,     a1,     a0
    blt            zero,   a5,     .AVG_W8_LASX
    b              .AVG_END_LASX
.AVG_W16_LASX:
    xvld           xr0,    a2,     0
    xvld           xr2,    a2,     32
    xvld           xr1,    a3,     0
    xvld           xr3,    a3,     32
    xvadd.h        xr4,    xr0,    xr1
    xvadd.h        xr5,    xr2,    xr3
    xvssrarni.bu.h xr5,    xr4,    bpc_sh
    xvpermi.d      xr2,    xr5,    0xd8
    xvpermi.d      xr3,    xr5,    0x8d
    vst            vr2,    a0,     0
    vstx           vr3,    a0,     a1
    addi.w         a5,     a5,     -2
    addi.d         a2,     a2,     64
    addi.d         a3,     a3,     64
    alsl.d         a0,     a1,     a0,   1
    blt            zero,   a5,     .AVG_W16_LASX
    b              .AVG_END_LASX
.AVG_W32_LASX:
    xvld           xr0,    a2,     0
    xvld           xr2,    a2,     32
    xvld           xr1,    a3,     0
    xvld           xr3,    a3,     32
    xvadd.h        xr4,    xr0,    xr1
    xvadd.h        xr5,    xr2,    xr3
    xvssrarni.bu.h xr5,    xr4,    bpc_sh
    xvpermi.d      xr6,    xr5,    0xd8
    xvst           xr6,    a0,     0
    addi.w         a5,     a5,     -1
    addi.d         a2,     a2,     64
    addi.d         a3,     a3,     64
    add.d          a0,     a0,     a1
    blt            zero,   a5,     .AVG_W32_LASX
    b              .AVG_END_LASX
.AVG_W64_LASX:
    xvld           xr0,    a2,     0
    xvld           xr2,    a2,     32
    xvld           xr4,    a2,     64
    xvld           xr6,    a2,     96
    xvld           xr1,    a3,     0
    xvld           xr3,    a3,     32
    xvld           xr5,    a3,     64
    xvld           xr7,    a3,     96
    xvadd.h        xr0,    xr0,    xr1
    xvadd.h        xr2,    xr2,    xr3
    xvadd.h        xr4,    xr4,    xr5
    xvadd.h        xr6,    xr6,    xr7
    xvssrarni.bu.h xr2,    xr0,    bpc_sh
    xvssrarni.bu.h xr6,    xr4,    bpc_sh
    xvpermi.d      xr1,    xr2,    0xd8
    xvpermi.d      xr3,    xr6,    0xd8
    xvst           xr1,    a0,     0
    xvst           xr3,    a0,     32
    addi.w         a5,     a5,     -1
    addi.d         a2,     a2,     128
    addi.d         a3,     a3,     128
    add.d          a0,     a0,     a1
    blt            zero,   a5,     .AVG_W64_LASX
    b              .AVG_END_LASX
.AVG_W128_LASX:
    xvld           xr0,    a2,     0
    xvld           xr2,    a2,     32
    xvld           xr4,    a2,     64
    xvld           xr6,    a2,     96
    xvld           xr8,    a2,     128
    xvld           xr10,   a2,     160
    xvld           xr12,   a2,     192
    xvld           xr14,   a2,     224
    xvld           xr1,    a3,     0
    xvld           xr3,    a3,     32
    xvld           xr5,    a3,     64
    xvld           xr7,    a3,     96
    xvld           xr9,    a3,     128
    xvld           xr11,   a3,     160
    xvld           xr13,   a3,     192
    xvld           xr15,   a3,     224
    xvadd.h        xr0,    xr0,    xr1
    xvadd.h        xr2,    xr2,    xr3
    xvadd.h        xr4,    xr4,    xr5
    xvadd.h        xr6,    xr6,    xr7
    xvadd.h        xr8,    xr8,    xr9
    xvadd.h        xr10,   xr10,   xr11
    xvadd.h        xr12,   xr12,   xr13
    xvadd.h        xr14,   xr14,   xr15
    xvssrarni.bu.h xr2,    xr0,    bpc_sh
    xvssrarni.bu.h xr6,    xr4,    bpc_sh
    xvssrarni.bu.h xr10,   xr8,    bpc_sh
    xvssrarni.bu.h xr14,   xr12,   bpc_sh
    xvpermi.d      xr1,    xr2,    0xd8
    xvpermi.d      xr3,    xr6,    0xd8
    xvpermi.d      xr5,    xr10,   0xd8
    xvpermi.d      xr7,    xr14,   0xd8
    xvst           xr1,    a0,     0
    xvst           xr3,    a0,     32
    xvst           xr5,    a0,     64
    xvst           xr7,    a0,     96
    addi.w         a5,     a5,     -1
    addi.d         a2,     a2,     256
    addi.d         a3,     a3,     256
    add.d          a0,     a0,     a1
    blt            zero,   a5,     .AVG_W128_LASX
.AVG_END_LASX:
endfunc

function w_avg_8bpc_lsx
    addi.d        t8,     a0,     0
    li.w          t2,     16
    sub.w         t2,     t2,     a6  // 16 - weight
    vreplgr2vr.h  vr21,   a6
    vreplgr2vr.h  vr22,   t2

    clz.w         t0,     a4
    li.w          t1,     24
    sub.w         t0,     t0,      t1
    la.local      t1,     .W_AVG_LSX_JRTABLE
    alsl.d        t0,     t0,      t1,    1
    ld.h          t2,     t0,      0
    add.d         t1,     t1,      t2
    jirl          $r0,    t1,      0

    .align   3
.W_AVG_LSX_JRTABLE:
    .hword .W_AVG_W128_LSX - .W_AVG_LSX_JRTABLE
    .hword .W_AVG_W64_LSX  - .W_AVG_LSX_JRTABLE
    .hword .W_AVG_W32_LSX  - .W_AVG_LSX_JRTABLE
    .hword .W_AVG_W16_LSX  - .W_AVG_LSX_JRTABLE
    .hword .W_AVG_W8_LSX   - .W_AVG_LSX_JRTABLE
    .hword .W_AVG_W4_LSX   - .W_AVG_LSX_JRTABLE

.W_AVG_W4_LSX:
    vld           vr0,    a2,     0
    vld           vr1,    a3,     0
    vmulwev.w.h   vr2,    vr0,    vr21
    vmulwod.w.h   vr3,    vr0,    vr21
    vmaddwev.w.h  vr2,    vr1,    vr22
    vmaddwod.w.h  vr3,    vr1,    vr22
    vssrarni.hu.w vr3,    vr2,    bpcw_sh
    vssrlni.bu.h  vr1,    vr3,    0
    vpickod.w     vr4,    vr2,    vr1
    vilvl.b       vr0,    vr4,    vr1
    fst.s         f0,     a0,     0
    add.d         a0,     a0,     a1
    vstelm.w      vr0,    a0,     0,   1
    addi.w        a5,     a5,     -2
    addi.d        a2,     a2,     16
    addi.d        a3,     a3,     16
    add.d         a0,     a1,     a0
    blt           zero,   a5,     .W_AVG_W4_LSX
    b             .W_AVG_END_LSX
.W_AVG_W8_LSX:
    vld           vr0,    a2,     0
    vld           vr1,    a3,     0
    vmulwev.w.h   vr2,    vr0,    vr21
    vmulwod.w.h   vr3,    vr0,    vr21
    vmaddwev.w.h  vr2,    vr1,    vr22
    vmaddwod.w.h  vr3,    vr1,    vr22
    vssrarni.hu.w vr3,    vr2,    bpcw_sh
    vssrlni.bu.h  vr1,    vr3,    0
    vpickod.w     vr4,    vr2,    vr1
    vilvl.b       vr0,    vr4,    vr1
    fst.d         f0,     a0,     0
    addi.w        a5,     a5,     -1
    addi.d        a2,     a2,     16
    addi.d        a3,     a3,     16
    add.d         a0,     a0,     a1
    blt           zero,   a5,     .W_AVG_W8_LSX
    b             .W_AVG_END_LSX
.W_AVG_W16_LSX:
    vld           vr0,    a2,     0
    vld           vr2,    a2,     16
    vld           vr1,    a3,     0
    vld           vr3,    a3,     16
    vmulwev.w.h   vr4,    vr0,    vr21
    vmulwod.w.h   vr5,    vr0,    vr21
    vmulwev.w.h   vr6,    vr2,    vr21
    vmulwod.w.h   vr7,    vr2,    vr21
    vmaddwev.w.h  vr4,    vr1,    vr22
    vmaddwod.w.h  vr5,    vr1,    vr22
    vmaddwev.w.h  vr6,    vr3,    vr22
    vmaddwod.w.h  vr7,    vr3,    vr22
    vssrarni.hu.w vr6,    vr4,    bpcw_sh
    vssrarni.hu.w vr7,    vr5,    bpcw_sh
    vssrlrni.bu.h vr7,    vr6,    0
    vshuf4i.w     vr8,    vr7,    0x4E
    vilvl.b       vr0,    vr8,    vr7
    vst           vr0,    a0,     0
    addi.w        a5,     a5,     -1
    addi.d        a2,     a2,     32
    addi.d        a3,     a3,     32
    add.d         a0,     a0,     a1
    blt           zero,   a5,     .W_AVG_W16_LSX
    b             .W_AVG_END_LSX
.W_AVG_W32_LSX:
.rept 2
    vld           vr0,    a2,     0
    vld           vr2,    a2,     16
    vld           vr1,    a3,     0
    vld           vr3,    a3,     16
    vmulwev.w.h   vr4,    vr0,    vr21
    vmulwod.w.h   vr5,    vr0,    vr21
    vmulwev.w.h   vr6,    vr2,    vr21
    vmulwod.w.h   vr7,    vr2,    vr21
    vmaddwev.w.h  vr4,    vr1,    vr22
    vmaddwod.w.h  vr5,    vr1,    vr22
    vmaddwev.w.h  vr6,    vr3,    vr22
    vmaddwod.w.h  vr7,    vr3,    vr22
    vssrarni.hu.w vr6,    vr4,    bpcw_sh
    vssrarni.hu.w vr7,    vr5,    bpcw_sh
    vssrlrni.bu.h vr7,    vr6,    0
    vshuf4i.w     vr8,    vr7,    0x4E
    vilvl.b       vr0,    vr8,    vr7
    vst           vr0,    a0,     0
    addi.d        a2,     a2,     32
    addi.d        a3,     a3,     32
    addi.d        a0,     a0,     16
.endr
    addi.w        a5,     a5,     -1
    add.d         t8,     t8,     a1
    add.d         a0,     t8,     zero
    blt           zero,   a5,     .W_AVG_W32_LSX
    b             .W_AVG_END_LSX

.W_AVG_W64_LSX:
.rept 4
    vld           vr0,    a2,     0
    vld           vr2,    a2,     16
    vld           vr1,    a3,     0
    vld           vr3,    a3,     16
    vmulwev.w.h   vr4,    vr0,    vr21
    vmulwod.w.h   vr5,    vr0,    vr21
    vmulwev.w.h   vr6,    vr2,    vr21
    vmulwod.w.h   vr7,    vr2,    vr21
    vmaddwev.w.h  vr4,    vr1,    vr22
    vmaddwod.w.h  vr5,    vr1,    vr22
    vmaddwev.w.h  vr6,    vr3,    vr22
    vmaddwod.w.h  vr7,    vr3,    vr22
    vssrarni.hu.w vr6,    vr4,    bpcw_sh
    vssrarni.hu.w vr7,    vr5,    bpcw_sh
    vssrlrni.bu.h vr7,    vr6,    0
    vshuf4i.w     vr8,    vr7,    0x4E
    vilvl.b       vr0,    vr8,    vr7
    vst           vr0,    a0,     0
    addi.d        a2,     a2,     32
    addi.d        a3,     a3,     32
    addi.d        a0,     a0,     16
.endr
    addi.w        a5,     a5,     -1
    add.d         t8,     t8,     a1
    add.d         a0,     t8,     zero
    blt           zero,   a5,     .W_AVG_W64_LSX
    b             .W_AVG_END_LSX

.W_AVG_W128_LSX:
.rept 8
    vld           vr0,    a2,     0
    vld           vr2,    a2,     16
    vld           vr1,    a3,     0
    vld           vr3,    a3,     16
    vmulwev.w.h   vr4,    vr0,    vr21
    vmulwod.w.h   vr5,    vr0,    vr21
    vmulwev.w.h   vr6,    vr2,    vr21
    vmulwod.w.h   vr7,    vr2,    vr21
    vmaddwev.w.h  vr4,    vr1,    vr22
    vmaddwod.w.h  vr5,    vr1,    vr22
    vmaddwev.w.h  vr6,    vr3,    vr22
    vmaddwod.w.h  vr7,    vr3,    vr22
    vssrarni.hu.w vr6,    vr4,    bpcw_sh
    vssrarni.hu.w vr7,    vr5,    bpcw_sh
    vssrlrni.bu.h vr7,    vr6,    0
    vshuf4i.w     vr8,    vr7,    0x4E
    vilvl.b       vr0,    vr8,    vr7
    vst           vr0,    a0,     0
    addi.d        a2,     a2,     32
    addi.d        a3,     a3,     32
    addi.d        a0,     a0,     16
.endr
    addi.w        a5,     a5,     -1
    add.d         t8,     t8,     a1
    add.d         a0,     t8,     zero
    blt           zero,   a5,     .W_AVG_W128_LSX
.W_AVG_END_LSX:
endfunc

function w_avg_8bpc_lasx
    addi.d        t8,     a0,     0
    li.w          t2,     16
    sub.w         t2,     t2,     a6  // 16 - weight
    xvreplgr2vr.h xr21,   a6
    xvreplgr2vr.h xr22,   t2

    clz.w         t0,     a4
    li.w          t1,     24
    sub.w         t0,     t0,      t1
    la.local      t1,     .W_AVG_LASX_JRTABLE
    alsl.d        t0,     t0,      t1,    1
    ld.h          t2,     t0,      0
    add.d         t1,     t1,      t2
    jirl          $r0,    t1,      0

    .align   3
.W_AVG_LASX_JRTABLE:
    .hword .W_AVG_W128_LASX - .W_AVG_LASX_JRTABLE
    .hword .W_AVG_W64_LASX  - .W_AVG_LASX_JRTABLE
    .hword .W_AVG_W32_LASX  - .W_AVG_LASX_JRTABLE
    .hword .W_AVG_W16_LASX  - .W_AVG_LASX_JRTABLE
    .hword .W_AVG_W8_LASX   - .W_AVG_LASX_JRTABLE
    .hword .W_AVG_W4_LASX   - .W_AVG_LASX_JRTABLE

.W_AVG_W4_LASX:
    vld            vr0,    a2,     0
    vld            vr1,    a3,     0
    xvpermi.d      xr2,    xr0,    0xD8
    xvpermi.d      xr3,    xr1,    0xD8
    xvilvl.h       xr4,    xr3,    xr2
    xvmulwev.w.h   xr0,    xr4,    xr21
    xvmaddwod.w.h  xr0,    xr4,    xr22
    xvssrarni.hu.w xr1,    xr0,    bpcw_sh
    xvssrlni.bu.h  xr0,    xr1,    0
    fst.s          f0,     a0,     0
    add.d          a0,     a0,     a1
    xvstelm.w      xr0,    a0,     0,     4
    addi.w         a5,     a5,     -2
    addi.d         a2,     a2,     16
    addi.d         a3,     a3,     16
    add.d          a0,     a1,     a0
    blt            zero,   a5,     .W_AVG_W4_LASX
    b              .W_AVG_END_LASX

.W_AVG_W8_LASX:
    xvld           xr0,    a2,     0
    xvld           xr1,    a3,     0
    xvmulwev.w.h   xr2,    xr0,    xr21
    xvmulwod.w.h   xr3,    xr0,    xr21
    xvmaddwev.w.h  xr2,    xr1,    xr22
    xvmaddwod.w.h  xr3,    xr1,    xr22
    xvssrarni.hu.w xr3,    xr2,    bpcw_sh
    xvssrlni.bu.h  xr1,    xr3,    0
    xvpickod.w     xr4,    xr2,    xr1
    xvilvl.b       xr0,    xr4,    xr1
    xvstelm.d      xr0,    a0,     0,     0
    add.d          a0,     a0,     a1
    xvstelm.d      xr0,    a0,     0,     2
    addi.w         a5,     a5,     -2
    addi.d         a2,     a2,     32
    addi.d         a3,     a3,     32
    add.d          a0,     a0,     a1
    blt            zero,   a5,     .W_AVG_W8_LASX
    b              .W_AVG_END_LASX

.W_AVG_W16_LASX:
    xvld           xr0,    a2,     0
    xvld           xr1,    a3,     0
    xvmulwev.w.h   xr2,    xr0,    xr21
    xvmulwod.w.h   xr3,    xr0,    xr21
    xvmaddwev.w.h  xr2,    xr1,    xr22
    xvmaddwod.w.h  xr3,    xr1,    xr22
    xvssrarni.hu.w xr3,    xr2,    bpcw_sh
    xvssrlni.bu.h  xr1,    xr3,    0
    xvpickod.w     xr4,    xr2,    xr1
    xvilvl.b       xr0,    xr4,    xr1
    xvpermi.d      xr1,    xr0,    0xD8
    vst            vr1,    a0,     0
    addi.w         a5,     a5,     -1
    addi.d         a2,     a2,     32
    addi.d         a3,     a3,     32
    add.d          a0,     a0,     a1
    blt            zero,   a5,     .W_AVG_W16_LASX
    b              .W_AVG_END_LASX

.W_AVG_W32_LASX:
    xvld           xr0,    a2,     0
    xvld           xr2,    a2,     32
    xvld           xr1,    a3,     0
    xvld           xr3,    a3,     32
    xvmulwev.w.h   xr4,    xr0,    xr21
    xvmulwod.w.h   xr5,    xr0,    xr21
    xvmulwev.w.h   xr6,    xr2,    xr21
    xvmulwod.w.h   xr7,    xr2,    xr21
    xvmaddwev.w.h  xr4,    xr1,    xr22
    xvmaddwod.w.h  xr5,    xr1,    xr22
    xvmaddwev.w.h  xr6,    xr3,    xr22
    xvmaddwod.w.h  xr7,    xr3,    xr22
    xvssrarni.hu.w xr6,    xr4,    bpcw_sh
    xvssrarni.hu.w xr7,    xr5,    bpcw_sh
    xvssrlni.bu.h  xr7,    xr6,    0
    xvshuf4i.w     xr8,    xr7,    0x4E
    xvilvl.b       xr9,    xr8,    xr7
    xvpermi.d      xr0,    xr9,    0xD8
    xvst           xr0,    a0,     0
    addi.w         a5,     a5,     -1
    addi.d         a2,     a2,     64
    addi.d         a3,     a3,     64
    add.d          a0,     a0,     a1
    blt            zero,   a5,     .W_AVG_W32_LASX
    b              .W_AVG_END_LASX

.W_AVG_W64_LASX:
.rept 2
    xvld           xr0,    a2,     0
    xvld           xr2,    a2,     32
    xvld           xr1,    a3,     0
    xvld           xr3,    a3,     32
    xvmulwev.w.h   xr4,    xr0,    xr21
    xvmulwod.w.h   xr5,    xr0,    xr21
    xvmulwev.w.h   xr6,    xr2,    xr21
    xvmulwod.w.h   xr7,    xr2,    xr21
    xvmaddwev.w.h  xr4,    xr1,    xr22
    xvmaddwod.w.h  xr5,    xr1,    xr22
    xvmaddwev.w.h  xr6,    xr3,    xr22
    xvmaddwod.w.h  xr7,    xr3,    xr22
    xvssrarni.hu.w xr6,    xr4,    bpcw_sh
    xvssrarni.hu.w xr7,    xr5,    bpcw_sh
    xvssrlni.bu.h  xr7,    xr6,    0
    xvshuf4i.w     xr8,    xr7,    0x4E
    xvilvl.b       xr9,    xr8,    xr7
    xvpermi.d      xr0,    xr9,    0xD8
    xvst           xr0,    a0,     0
    addi.d         a2,     a2,     64
    addi.d         a3,     a3,     64
    addi.d         a0,     a0,     32
.endr
    addi.w         a5,     a5,     -1
    add.d          t8,     t8,     a1
    add.d          a0,     t8,     zero
    blt            zero,   a5,     .W_AVG_W64_LASX
    b              .W_AVG_END_LASX

.W_AVG_W128_LASX:
.rept 4
    xvld           xr0,    a2,     0
    xvld           xr2,    a2,     32
    xvld           xr1,    a3,     0
    xvld           xr3,    a3,     32
    xvmulwev.w.h   xr4,    xr0,    xr21
    xvmulwod.w.h   xr5,    xr0,    xr21
    xvmulwev.w.h   xr6,    xr2,    xr21
    xvmulwod.w.h   xr7,    xr2,    xr21
    xvmaddwev.w.h  xr4,    xr1,    xr22
    xvmaddwod.w.h  xr5,    xr1,    xr22
    xvmaddwev.w.h  xr6,    xr3,    xr22
    xvmaddwod.w.h  xr7,    xr3,    xr22
    xvssrarni.hu.w xr6,    xr4,    bpcw_sh
    xvssrarni.hu.w xr7,    xr5,    bpcw_sh
    xvssrlni.bu.h  xr7,    xr6,    0
    xvshuf4i.w     xr8,    xr7,    0x4E
    xvilvl.b       xr9,    xr8,    xr7
    xvpermi.d      xr0,    xr9,    0xD8
    xvst           xr0,    a0,     0
    addi.d         a2,     a2,     64
    addi.d         a3,     a3,     64
    addi.d         a0,     a0,     32
.endr

    addi.w         a5,     a5,     -1
    add.d          t8,     t8,     a1
    add.d          a0,     t8,     zero
    blt            zero,   a5,     .W_AVG_W128_LASX
.W_AVG_END_LASX:
endfunc

#undef bpc_sh
#undef bpcw_sh

#define mask_sh         10
/*
static void mask_c(pixel *dst, const ptrdiff_t dst_stride,
                   const int16_t *tmp1, const int16_t *tmp2, const int w, int h,
                   const uint8_t *mask HIGHBD_DECL_SUFFIX)
*/
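/*
 * Per-pixel reference for the function below, in C (8bpc, so the rounded
 * shift is mask_sh == 10 and the rounding constant is 512; iclip_u8() is
 * illustrative). mask[] holds per-pixel weights in 0..64:
 *
 *     for (int y = 0; y < h; y++) {
 *         for (int x = 0; x < w; x++)
 *             dst[x] = iclip_u8((tmp1[x] * mask[x] +
 *                                tmp2[x] * (64 - mask[x]) + 512) >> 10);
 *         tmp1 += w; tmp2 += w; mask += w;
 *         dst += dst_stride;
 *     }
 */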
function mask_8bpc_lsx
    vldi          vr21,   0x440           // vr21.h = 64
    vxor.v        vr19,   vr19,   vr19    // vr19 = 0, for widening the mask bytes
    addi.d        t8,     a0,     0
    clz.w         t0,     a4
    li.w          t1,     24
    sub.w         t0,     t0,      t1
    la.local      t1,     .MASK_LSX_JRTABLE
    alsl.d        t0,     t0,      t1,    1
    ld.h          t2,     t0,      0
    add.d         t1,     t1,      t2
    jirl          $r0,    t1,      0

    .align   3
.MASK_LSX_JRTABLE:
    .hword .MASK_W128_LSX - .MASK_LSX_JRTABLE
    .hword .MASK_W64_LSX  - .MASK_LSX_JRTABLE
    .hword .MASK_W32_LSX  - .MASK_LSX_JRTABLE
    .hword .MASK_W16_LSX  - .MASK_LSX_JRTABLE
    .hword .MASK_W8_LSX   - .MASK_LSX_JRTABLE
    .hword .MASK_W4_LSX   - .MASK_LSX_JRTABLE

.MASK_W4_LSX:
    vld           vr0,     a2,     0
    vld           vr1,     a3,     0
    fld.d         f22,     a6,     0

    vilvl.b       vr2,    vr19,   vr22    // widen mask bytes m to u16
    vsub.h        vr3,    vr21,   vr2     // 64 - m

    vmulwev.w.h   vr4,    vr0,    vr2
    vmulwod.w.h   vr5,    vr0,    vr2
    vmaddwev.w.h  vr4,    vr1,    vr3
    vmaddwod.w.h  vr5,    vr1,    vr3
    vssrarni.hu.w vr5,    vr4,    mask_sh // (tmp1*m + tmp2*(64 - m) + 512) >> 10
    vssrlrni.bu.h vr1,    vr5,    0
    vpickod.w     vr4,    vr2,    vr1
    vilvl.b       vr0,    vr4,    vr1
    fst.s         f0,     a0,     0
    add.d         a0,     a0,     a1
    vstelm.w      vr0,    a0,     0,    1
    addi.d        a2,     a2,     16
    addi.d        a3,     a3,     16
    addi.d        a6,     a6,     8
    add.d         a0,     a0,     a1
    addi.w        a5,     a5,     -2
    blt           zero,   a5,     .MASK_W4_LSX
    b             .MASK_END_LSX
.MASK_W8_LSX:
    vld           vr0,    a2,     0
    vld           vr10,   a2,     16
    vld           vr1,    a3,     0
    vld           vr11,   a3,     16
    vld           vr22,   a6,     0

    vilvl.b       vr2,    vr19,   vr22
    vilvh.b       vr12,   vr19,   vr22
    vsub.h        vr3,    vr21,   vr2
    vsub.h        vr13,   vr21,   vr12

    vmulwev.w.h   vr4,    vr0,    vr2
    vmulwod.w.h   vr5,    vr0,    vr2
    vmulwev.w.h   vr14,   vr10,   vr12
    vmulwod.w.h   vr15,   vr10,   vr12
    vmaddwev.w.h  vr4,    vr1,    vr3
    vmaddwod.w.h  vr5,    vr1,    vr3
    vmaddwev.w.h  vr14,   vr11,   vr13
    vmaddwod.w.h  vr15,   vr11,   vr13
    vssrarni.hu.w vr14,   vr4,    mask_sh
    vssrarni.hu.w vr15,   vr5,    mask_sh
    vssrlrni.bu.h vr15,   vr14,   0
    vshuf4i.w     vr6,    vr15,   0x4E
    vilvl.b       vr0,    vr6,    vr15
    fst.d         f0,     a0,     0
    add.d         a0,     a0,     a1
    vstelm.d      vr0,    a0,     0,   1
    addi.d        a2,     a2,     32
    addi.d        a3,     a3,     32
    addi.d        a6,     a6,     16
    add.d         a0,     a0,     a1
    addi.w        a5,     a5,     -2
    blt           zero,   a5,     .MASK_W8_LSX
    b             .MASK_END_LSX

.MASK_W16_LSX:
    vld           vr0,    a2,     0
    vld           vr10,   a2,     16
    vld           vr1,    a3,     0
    vld           vr11,   a3,     16
    vld           vr22,   a6,     0

    vilvl.b       vr2,    vr19,   vr22
    vilvh.b       vr12,   vr19,   vr22
    vsub.h        vr3,    vr21,   vr2
    vsub.h        vr13,   vr21,   vr12

    vmulwev.w.h   vr4,    vr0,    vr2
    vmulwod.w.h   vr5,    vr0,    vr2
    vmulwev.w.h   vr14,   vr10,   vr12
    vmulwod.w.h   vr15,   vr10,   vr12
    vmaddwev.w.h  vr4,    vr1,    vr3
    vmaddwod.w.h  vr5,    vr1,    vr3
    vmaddwev.w.h  vr14,   vr11,   vr13
    vmaddwod.w.h  vr15,   vr11,   vr13
    vssrarni.hu.w vr14,   vr4,    mask_sh
    vssrarni.hu.w vr15,   vr5,    mask_sh
    vssrlrni.bu.h vr15,   vr14,   0
    vshuf4i.w     vr6,    vr15,   0x4E
    vilvl.b       vr0,    vr6,    vr15
    vst           vr0,    a0,     0
    addi.d        a2,     a2,     32
    addi.d        a3,     a3,     32
    addi.d        a6,     a6,     16
    add.d         a0,     a0,     a1
    addi.w        a5,     a5,     -1
    blt           zero,   a5,     .MASK_W16_LSX
    b             .MASK_END_LSX
1460.MASK_W32_LSX:
1461.rept 2
1462    vld           vr0,    a2,     0
1463    vld           vr10,   a2,     16
1464    vld           vr1,    a3,     0
1465    vld           vr11,   a3,     16
1466    vld           vr22,   a6,     0
1467    vilvl.b       vr2,    vr19,   vr22
1468    vilvh.b       vr12,   vr19,   vr22
1469    vsub.h        vr3,    vr21,   vr2
1470    vsub.h        vr13,   vr21,   vr12
1471    vmulwev.w.h   vr4,    vr0,    vr2
1472    vmulwod.w.h   vr5,    vr0,    vr2
1473    vmulwev.w.h   vr14,   vr10,   vr12
1474    vmulwod.w.h   vr15,   vr10,   vr12
1475    vmaddwev.w.h  vr4,    vr1,    vr3
1476    vmaddwod.w.h  vr5,    vr1,    vr3
1477    vmaddwev.w.h  vr14,   vr11,   vr13
1478    vmaddwod.w.h  vr15,   vr11,   vr13
1479    vssrarni.hu.w vr14,   vr4,    mask_sh
1480    vssrarni.hu.w vr15,   vr5,    mask_sh
1481    vssrlrni.bu.h vr15,   vr14,   0
1482    vshuf4i.w     vr6,    vr15,   0x4E
1483    vilvl.b       vr0,    vr6,    vr15
1484    vst           vr0,    a0,     0
1485    addi.d        a2,     a2,     32
1486    addi.d        a3,     a3,     32
1487    addi.d        a6,     a6,     16
1488    addi.d        a0,     a0,     16
1489.endr
1490    add.d         t8,     t8,     a1
1491    add.d         a0,     t8,     zero
1492    addi.w        a5,     a5,     -1
1493    blt           zero,   a5,     .MASK_W32_LSX
1494    b             .MASK_END_LSX
1495.MASK_W64_LSX:
1496.rept 4
1497    vld           vr0,    a2,     0
1498    vld           vr10,   a2,     16
1499    vld           vr1,    a3,     0
1500    vld           vr11,   a3,     16
1501    vld           vr22,   a6,     0
1502    vilvl.b       vr2,    vr19,   vr22
1503    vilvh.b       vr12,   vr19,   vr22
1504    vsub.h        vr3,    vr21,   vr2
1505    vsub.h        vr13,   vr21,   vr12
1506    vmulwev.w.h   vr4,    vr0,    vr2
1507    vmulwod.w.h   vr5,    vr0,    vr2
1508    vmulwev.w.h   vr14,   vr10,   vr12
1509    vmulwod.w.h   vr15,   vr10,   vr12
1510    vmaddwev.w.h  vr4,    vr1,    vr3
1511    vmaddwod.w.h  vr5,    vr1,    vr3
1512    vmaddwev.w.h  vr14,   vr11,   vr13
1513    vmaddwod.w.h  vr15,   vr11,   vr13
1514    vssrarni.hu.w vr14,   vr4,    mask_sh
1515    vssrarni.hu.w vr15,   vr5,    mask_sh
1516    vssrlrni.bu.h vr15,   vr14,   0
1517    vshuf4i.w     vr6,    vr15,   0x4E
1518    vilvl.b       vr0,    vr6,    vr15
1519    vst           vr0,    a0,     0
1520    addi.d        a2,     a2,     32
1521    addi.d        a3,     a3,     32
1522    addi.d        a6,     a6,     16
1523    addi.d        a0,     a0,     16
1524.endr
1525    add.d         t8,     t8,     a1
1526    add.d         a0,     t8,     zero
1527    addi.w        a5,     a5,     -1
1528    blt           zero,   a5,     .MASK_W64_LSX
1529    b             .MASK_END_LSX
1530.MASK_W128_LSX:
1531.rept 8
1532    vld           vr0,    a2,     0
1533    vld           vr10,   a2,     16
1534    vld           vr1,    a3,     0
1535    vld           vr11,   a3,     16
1536    vld           vr22,   a6,     0
1537    vilvl.b       vr2,    vr19,   vr22
1538    vilvh.b       vr12,   vr19,   vr22
1539    vsub.h        vr3,    vr21,   vr2
1540    vsub.h        vr13,   vr21,   vr12
1541    vmulwev.w.h   vr4,    vr0,    vr2
1542    vmulwod.w.h   vr5,    vr0,    vr2
1543    vmulwev.w.h   vr14,   vr10,   vr12
1544    vmulwod.w.h   vr15,   vr10,   vr12
1545    vmaddwev.w.h  vr4,    vr1,    vr3
1546    vmaddwod.w.h  vr5,    vr1,    vr3
1547    vmaddwev.w.h  vr14,   vr11,   vr13
1548    vmaddwod.w.h  vr15,   vr11,   vr13
1549    vssrarni.hu.w vr14,   vr4,    mask_sh
1550    vssrarni.hu.w vr15,   vr5,    mask_sh
1551    vssrlrni.bu.h vr15,   vr14,   0
1552    vshuf4i.w     vr6,    vr15,   0x4E
1553    vilvl.b       vr0,    vr6,    vr15
1554    vst           vr0,    a0,     0
1555    addi.d        a2,     a2,     32
1556    addi.d        a3,     a3,     32
1557    addi.d        a6,     a6,     16
1558    addi.d        a0,     a0,     16
1559.endr
1560    add.d         t8,     t8,     a1
1561    add.d         a0,     t8,     zero
1562    addi.w        a5,     a5,     -1
1563    blt           zero,   a5,     .MASK_W128_LSX
1564.MASK_END_LSX:
1565endfunc
1566
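/*
 For reference, a scalar sketch of the blend both mask functions vectorize
 (8 bpc, mask_sh = 10; clip_u8 is a stand-in name for clamping to [0, 255],
 not the verbatim C reference):

    for (int i = 0; i < w * h; i++)
        dst[i] = clip_u8((tmp1[i] * mask[i] +
                          tmp2[i] * (64 - mask[i]) + 512) >> 10);
*/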
function mask_8bpc_lasx
    xvldi         xr21,   0x440   // 64
    xvxor.v       xr19,   xr19,   xr19    // 0
    addi.d        t8,     a0,     0       // t8 = dst
    clz.w         t0,     a4
    li.w          t1,     24
    sub.w         t0,     t0,      t1
    la.local      t1,     .MASK_LASX_JRTABLE
    alsl.d        t0,     t0,      t1,    1
    ld.h          t2,     t0,      0
    add.d         t1,     t1,      t2
    jirl          $r0,    t1,      0

    .align   3
.MASK_LASX_JRTABLE:
    .hword .MASK_W128_LASX - .MASK_LASX_JRTABLE
    .hword .MASK_W64_LASX  - .MASK_LASX_JRTABLE
    .hword .MASK_W32_LASX  - .MASK_LASX_JRTABLE
    .hword .MASK_W16_LASX  - .MASK_LASX_JRTABLE
    .hword .MASK_W8_LASX   - .MASK_LASX_JRTABLE
    .hword .MASK_W4_LASX   - .MASK_LASX_JRTABLE

.MASK_W4_LASX:
    vld            vr0,    a2,     0
    vld            vr1,    a3,     0
    fld.d          f22,    a6,     0

    vilvl.h        vr4,    vr1,    vr0
    vilvh.h        vr14,   vr1,    vr0
    vilvl.b        vr2,    vr19,   vr22
    vsub.h         vr3,    vr21,   vr2
    xvpermi.q      xr14,   xr4,    0x20
    vilvl.h        vr5,    vr3,    vr2
    vilvh.h        vr15,   vr3,    vr2
    xvpermi.q      xr15,   xr5,    0x20
    xvmulwev.w.h   xr0,    xr14,   xr15
    xvmaddwod.w.h  xr0,    xr14,   xr15
    xvssrarni.hu.w xr1,    xr0,    mask_sh
    xvssrlni.bu.h  xr2,    xr1,    0
    fst.s          f2,     a0,     0
    add.d          a0,     a0,     a1
    xvstelm.w      xr2,    a0,     0,    4

    addi.d         a2,     a2,     16
    addi.d         a3,     a3,     16
    addi.d         a6,     a6,     8
    add.d          a0,     a0,     a1
    addi.w         a5,     a5,     -2
    blt            zero,   a5,     .MASK_W4_LASX
    b              .MASK_END_LASX

.MASK_W8_LASX:
    xvld           xr0,    a2,      0
    xvld           xr1,    a3,      0
    vld            vr22,   a6,      0

    vext2xv.hu.bu  xr2,    xr22
    xvsub.h        xr3,    xr21,    xr2
    xvmulwev.w.h   xr4,    xr0,     xr2
    xvmulwod.w.h   xr5,    xr0,     xr2
    xvmaddwev.w.h  xr4,    xr1,     xr3
    xvmaddwod.w.h  xr5,    xr1,     xr3
    xvssrarni.hu.w xr5,    xr4,     mask_sh
    xvssrlni.bu.h  xr1,    xr5,     0
    xvpickod.w     xr4,    xr2,     xr1
    xvilvl.b       xr0,    xr4,     xr1
    fst.d          f0,     a0,      0
    add.d          a0,     a0,      a1
    xvstelm.d      xr0,    a0,      0,    2

    addi.d         a2,     a2,      32
    addi.d         a3,     a3,      32
    addi.d         a6,     a6,      16
    add.d          a0,     a0,      a1
    addi.w         a5,     a5,      -2
    blt            zero,   a5,      .MASK_W8_LASX
    b              .MASK_END_LASX

.MASK_W16_LASX:
    xvld           xr0,    a2,      0
    xvld           xr1,    a3,      0
    vld            vr22,   a6,      0

    vext2xv.hu.bu  xr2,    xr22
    xvsub.h        xr3,    xr21,    xr2
    xvmulwev.w.h   xr4,    xr0,     xr2
    xvmulwod.w.h   xr5,    xr0,     xr2
    xvmaddwev.w.h  xr4,    xr1,     xr3
    xvmaddwod.w.h  xr5,    xr1,     xr3
    xvssrarni.hu.w xr5,    xr4,     mask_sh
    xvssrlni.bu.h  xr1,    xr5,     0
    xvpickod.w     xr4,    xr2,     xr1
    xvilvl.b       xr0,    xr4,     xr1
    xvpermi.d      xr1,    xr0,     0xD8
    vst            vr1,    a0,      0

    addi.d         a2,     a2,      32
    addi.d         a3,     a3,      32
    addi.d         a6,     a6,      16
    add.d          a0,     a0,      a1
    addi.w         a5,     a5,      -1
    blt            zero,   a5,      .MASK_W16_LASX
    b              .MASK_END_LASX
.MASK_W32_LASX:
    xvld           xr0,    a2,      0
    xvld           xr10,   a2,      32
    xvld           xr1,    a3,      0
    xvld           xr11,   a3,      32
    xvld           xr22,   a6,      0
    vext2xv.hu.bu  xr2,    xr22
    xvpermi.q      xr4,    xr22,    0x01
    vext2xv.hu.bu  xr12,   xr4
    xvsub.h        xr3,    xr21,    xr2
    xvsub.h        xr13,   xr21,    xr12

    xvmulwev.w.h   xr4,    xr0,     xr2
    xvmulwod.w.h   xr5,    xr0,     xr2
    xvmulwev.w.h   xr14,   xr10,    xr12
    xvmulwod.w.h   xr15,   xr10,    xr12
    xvmaddwev.w.h  xr4,    xr1,     xr3
    xvmaddwod.w.h  xr5,    xr1,     xr3
    xvmaddwev.w.h  xr14,   xr11,    xr13
    xvmaddwod.w.h  xr15,   xr11,    xr13
    xvssrarni.hu.w xr14,   xr4,     mask_sh
    xvssrarni.hu.w xr15,   xr5,     mask_sh
    xvssrlni.bu.h  xr15,   xr14,    0
    xvshuf4i.w     xr6,    xr15,    0x4E
    xvilvl.b       xr1,    xr6,     xr15
    xvpermi.d      xr0,    xr1,     0xD8
    xvst           xr0,    a0,      0

    addi.d         a2,     a2,      64
    addi.d         a3,     a3,      64
    addi.d         a6,     a6,      32
    add.d          a0,     a0,      a1
    addi.w         a5,     a5,      -1
    blt            zero,   a5,      .MASK_W32_LASX
    b              .MASK_END_LASX

.MASK_W64_LASX:
.rept 2
    xvld           xr0,    a2,      0
    xvld           xr10,   a2,      32
    xvld           xr1,    a3,      0
    xvld           xr11,   a3,      32
    xvld           xr22,   a6,      0
    vext2xv.hu.bu  xr2,    xr22
    xvpermi.q      xr4,    xr22,    0x01
    vext2xv.hu.bu  xr12,   xr4
    xvsub.h        xr3,    xr21,    xr2
    xvsub.h        xr13,   xr21,    xr12

    xvmulwev.w.h   xr4,    xr0,     xr2
    xvmulwod.w.h   xr5,    xr0,     xr2
    xvmulwev.w.h   xr14,   xr10,    xr12
    xvmulwod.w.h   xr15,   xr10,    xr12
    xvmaddwev.w.h  xr4,    xr1,     xr3
    xvmaddwod.w.h  xr5,    xr1,     xr3
    xvmaddwev.w.h  xr14,   xr11,    xr13
    xvmaddwod.w.h  xr15,   xr11,    xr13
    xvssrarni.hu.w xr14,   xr4,     mask_sh
    xvssrarni.hu.w xr15,   xr5,     mask_sh
    xvssrlni.bu.h  xr15,   xr14,    0
    xvshuf4i.w     xr6,    xr15,    0x4E
    xvilvl.b       xr1,    xr6,     xr15
    xvpermi.d      xr0,    xr1,     0xD8
    xvst           xr0,    a0,      0
    addi.d         a2,     a2,      64
    addi.d         a3,     a3,      64
    addi.d         a6,     a6,      32
    addi.d         a0,     a0,      32
.endr
    add.d          t8,     t8,     a1
    add.d          a0,     t8,     zero
    addi.w         a5,     a5,      -1
    blt            zero,   a5,      .MASK_W64_LASX
    b              .MASK_END_LASX

.MASK_W128_LASX:
.rept 4
    xvld           xr0,    a2,      0
    xvld           xr10,   a2,      32
    xvld           xr1,    a3,      0
    xvld           xr11,   a3,      32
    xvld           xr22,   a6,      0
    vext2xv.hu.bu  xr2,    xr22
    xvpermi.q      xr4,    xr22,    0x01
    vext2xv.hu.bu  xr12,   xr4
    xvsub.h        xr3,    xr21,    xr2
    xvsub.h        xr13,   xr21,    xr12

    xvmulwev.w.h   xr4,    xr0,     xr2
    xvmulwod.w.h   xr5,    xr0,     xr2
    xvmulwev.w.h   xr14,   xr10,    xr12
    xvmulwod.w.h   xr15,   xr10,    xr12
    xvmaddwev.w.h  xr4,    xr1,     xr3
    xvmaddwod.w.h  xr5,    xr1,     xr3
    xvmaddwev.w.h  xr14,   xr11,    xr13
    xvmaddwod.w.h  xr15,   xr11,    xr13
    xvssrarni.hu.w xr14,   xr4,     mask_sh
    xvssrarni.hu.w xr15,   xr5,     mask_sh
    xvssrlni.bu.h  xr15,   xr14,    0
    xvshuf4i.w     xr6,    xr15,    0x4E
    xvilvl.b       xr1,    xr6,     xr15
    xvpermi.d      xr0,    xr1,     0xD8
    xvst           xr0,    a0,      0

    addi.d         a2,     a2,      64
    addi.d         a3,     a3,      64
    addi.d         a6,     a6,      32
    addi.d         a0,     a0,      32
.endr
    add.d          t8,     t8,     a1
    add.d          a0,     t8,     zero
    addi.w         a5,     a5,      -1
    blt            zero,   a5,      .MASK_W128_LASX
.MASK_END_LASX:
endfunc

/*
static void w_mask_c(pixel *dst, const ptrdiff_t dst_stride,
                     const int16_t *tmp1, const int16_t *tmp2, const int w, int h,
                     uint8_t *mask, const int sign,
                     const int ss_hor, const int ss_ver HIGHBD_DECL_SUFFIX)
*/
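/*
 The bodies below implement the 4:2:0 case (ss_hor = ss_ver = 1). A scalar
 sketch of the per-pixel math they vectorize for 8 bpc (clip_u8 is a
 stand-in clamp to [0, 255]; not the verbatim C reference):

    int m  = imin(38 + ((abs(tmp1[x] - tmp2[x]) + 8) >> 8), 64);
    dst[x] = clip_u8((tmp1[x] * m + tmp2[x] * (64 - m) + 512) >> 10);
    // plus one mask byte per 2x2 block, from its four m values:
    mask[x >> 1] = (m00 + m01 + m10 + m11 + 2 - sign) >> 2;
*/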
function w_mask_420_8bpc_lsx
    addi.d        sp,      sp,    -24
    fst.d         f24,     sp,    0
    fst.d         f25,     sp,    8
    fst.d         f26,     sp,    16
    vldi          vr20,    0x440   // 64
    vreplgr2vr.h  vr21,    a7      // sign
    vldi          vr22,    0x426   // 38

    clz.w         t0,      a4
    li.w          t1,      24
    sub.w         t0,      t0,      t1
    la.local      t1,      .WMASK420_LSX_JRTABLE
    alsl.d        t0,      t0,      t1,    1
    ld.h          t8,      t0,      0
    add.d         t1,      t1,      t8
    jirl          $r0,     t1,      0

    .align   3
.WMASK420_LSX_JRTABLE:
    .hword .WMASK420_W128_LSX - .WMASK420_LSX_JRTABLE
    .hword .WMASK420_W64_LSX  - .WMASK420_LSX_JRTABLE
    .hword .WMASK420_W32_LSX  - .WMASK420_LSX_JRTABLE
    .hword .WMASK420_W16_LSX  - .WMASK420_LSX_JRTABLE
    .hword .WMASK420_W8_LSX   - .WMASK420_LSX_JRTABLE
    .hword .WMASK420_W4_LSX   - .WMASK420_LSX_JRTABLE

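    // w == 4: four rows per iteration (two 8-element vectors from each of
    // tmp1/tmp2), producing 4 dst rows and 4 mask bytes (one per 2x2 block).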
.WMASK420_W4_LSX:
    vld           vr0,     a2,       0
    vld           vr1,     a2,       16
    vld           vr2,     a3,       0
    vld           vr3,     a3,       16
    addi.w        a5,      a5,       -4

    vabsd.h       vr4,     vr0,      vr2
    vabsd.h       vr5,     vr1,      vr3
    vaddi.hu      vr4,     vr4,      8
    vaddi.hu      vr5,     vr5,      8
    vsrli.h       vr4,     vr4,      8
    vsrli.h       vr5,     vr5,      8
    vadd.h        vr4,     vr4,      vr22
    vadd.h        vr5,     vr5,      vr22
    vmin.hu       vr6,     vr4,      vr20
    vmin.hu       vr7,     vr5,      vr20
    vsub.h        vr8,     vr20,     vr6
    vsub.h        vr9,     vr20,     vr7
    vmulwev.w.h   vr4,     vr6,      vr0
    vmulwod.w.h   vr5,     vr6,      vr0
    vmulwev.w.h   vr10,    vr7,      vr1
    vmulwod.w.h   vr11,    vr7,      vr1
    vmaddwev.w.h  vr4,     vr8,      vr2
    vmaddwod.w.h  vr5,     vr8,      vr2
    vmaddwev.w.h  vr10,    vr9,      vr3
    vmaddwod.w.h  vr11,    vr9,      vr3
    vilvl.w       vr0,     vr5,      vr4
    vilvh.w       vr1,     vr5,      vr4
    vilvl.w       vr2,     vr11,     vr10
    vilvh.w       vr3,     vr11,     vr10
    vssrarni.hu.w vr1,     vr0,      10
    vssrarni.hu.w vr3,     vr2,      10
    vssrlni.bu.h  vr3,     vr1,      0
    vstelm.w      vr3,     a0,       0,    0
    add.d         a0,      a0,       a1
    vstelm.w      vr3,     a0,       0,    1
    add.d         a0,      a0,       a1
    vstelm.w      vr3,     a0,       0,    2
    add.d         a0,      a0,       a1
    vstelm.w      vr3,     a0,       0,    3
    add.d         a0,      a0,       a1
    vpickev.h     vr0,     vr7,      vr6
    vpickod.h     vr1,     vr7,      vr6
    vadd.h        vr0,     vr0,      vr1
    vshuf4i.h     vr0,     vr0,      0xd8
    vhaddw.w.h    vr2,     vr0,      vr0
    vpickev.h     vr2,     vr2,      vr2
    vsub.h        vr2,     vr2,      vr21
    vaddi.hu      vr2,     vr2,      2
    vssrani.bu.h  vr2,     vr2,      2
    vstelm.w      vr2,     a6,       0,    0

    addi.d        a2,      a2,       32
    addi.d        a3,      a3,       32
    addi.d        a6,      a6,       4
    blt           zero,    a5,       .WMASK420_W4_LSX
    b             .END_W420

.WMASK420_W8_LSX:
    vld           vr0,     a2,       0
    vld           vr1,     a2,       16
    vld           vr2,     a3,       0
    vld           vr3,     a3,       16
    addi.w        a5,      a5,       -2

    vabsd.h       vr4,     vr0,      vr2
    vabsd.h       vr5,     vr1,      vr3
    vaddi.hu      vr4,     vr4,      8
    vaddi.hu      vr5,     vr5,      8
    vsrli.h       vr4,     vr4,      8
    vsrli.h       vr5,     vr5,      8
    vadd.h        vr4,     vr4,      vr22
    vadd.h        vr5,     vr5,      vr22
    vmin.hu       vr6,     vr4,      vr20
    vmin.hu       vr7,     vr5,      vr20
    vsub.h        vr8,     vr20,     vr6
    vsub.h        vr9,     vr20,     vr7
    vmulwev.w.h   vr4,     vr6,      vr0
    vmulwod.w.h   vr5,     vr6,      vr0
    vmulwev.w.h   vr10,    vr7,      vr1
    vmulwod.w.h   vr11,    vr7,      vr1
    vmaddwev.w.h  vr4,     vr8,      vr2
    vmaddwod.w.h  vr5,     vr8,      vr2
    vmaddwev.w.h  vr10,    vr9,      vr3
    vmaddwod.w.h  vr11,    vr9,      vr3
    vssrarni.hu.w vr10,    vr4,      10
    vssrarni.hu.w vr11,    vr5,      10
    vssrlni.bu.h  vr11,    vr10,     0
    vshuf4i.w     vr0,     vr11,     0x4E
    vilvl.b       vr3,     vr0,      vr11
    vstelm.d      vr3,     a0,       0,     0
    add.d         a0,      a0,       a1
    vstelm.d      vr3,     a0,       0,     1
    add.d         a0,      a0,       a1
    vpickev.h     vr0,     vr7,      vr6
    vpickod.h     vr1,     vr7,      vr6
    vadd.h        vr0,     vr0,      vr1
    vilvh.d       vr2,     vr0,      vr0
    vadd.h        vr2,     vr2,      vr0
    vsub.h        vr2,     vr2,      vr21
    vaddi.hu      vr2,     vr2,      2
    vssrani.bu.h  vr2,     vr2,      2
    vstelm.w      vr2,     a6,       0,     0

    addi.d        a2,      a2,       32
    addi.d        a3,      a3,       32
    addi.d        a6,      a6,       4
    blt           zero,    a5,       .WMASK420_W8_LSX
    b             .END_W420

.WMASK420_W16_LSX:
    vld           vr0,     a2,       0
    vld           vr1,     a2,       16
    alsl.d        a2,      a4,       a2,    1
    vld           vr2,     a2,       0
    vld           vr3,     a2,       16
    vld           vr4,     a3,       0
    vld           vr5,     a3,       16
    alsl.d        a3,      a4,       a3,    1
    vld           vr6,     a3,       0
    vld           vr7,     a3,       16

    vabsd.h       vr8,     vr0,      vr4
    vabsd.h       vr9,     vr1,      vr5
    vabsd.h       vr10,    vr2,      vr6
    vabsd.h       vr11,    vr3,      vr7
    vaddi.hu      vr8,     vr8,      8
    vaddi.hu      vr9,     vr9,      8
    vaddi.hu      vr10,    vr10,     8
    vaddi.hu      vr11,    vr11,     8
    vsrli.h       vr8,     vr8,      8
    vsrli.h       vr9,     vr9,      8
    vsrli.h       vr10,    vr10,     8
    vsrli.h       vr11,    vr11,     8
    vadd.h        vr8,     vr8,      vr22
    vadd.h        vr9,     vr9,      vr22
    vadd.h        vr10,    vr10,     vr22
    vadd.h        vr11,    vr11,     vr22
    vmin.hu       vr12,    vr8,      vr20
    vmin.hu       vr13,    vr9,      vr20
    vmin.hu       vr14,    vr10,     vr20
    vmin.hu       vr15,    vr11,     vr20
    vsub.h        vr16,    vr20,     vr12
    vsub.h        vr17,    vr20,     vr13
    vsub.h        vr18,    vr20,     vr14
    vsub.h        vr19,    vr20,     vr15
    vmulwev.w.h   vr8,     vr12,     vr0
    vmulwod.w.h   vr9,     vr12,     vr0
    vmulwev.w.h   vr10,    vr13,     vr1
    vmulwod.w.h   vr11,    vr13,     vr1
    vmulwev.w.h   vr23,    vr14,     vr2
    vmulwod.w.h   vr24,    vr14,     vr2
    vmulwev.w.h   vr25,    vr15,     vr3
    vmulwod.w.h   vr26,    vr15,     vr3
    vmaddwev.w.h  vr8,     vr16,     vr4
    vmaddwod.w.h  vr9,     vr16,     vr4
    vmaddwev.w.h  vr10,    vr17,     vr5
    vmaddwod.w.h  vr11,    vr17,     vr5
    vmaddwev.w.h  vr23,    vr18,     vr6
    vmaddwod.w.h  vr24,    vr18,     vr6
    vmaddwev.w.h  vr25,    vr19,     vr7
    vmaddwod.w.h  vr26,    vr19,     vr7
    vssrarni.hu.w vr10,    vr8,      10
    vssrarni.hu.w vr11,    vr9,      10
    vssrarni.hu.w vr25,    vr23,     10
    vssrarni.hu.w vr26,    vr24,     10
    vssrlni.bu.h  vr11,    vr10,     0
    vssrlni.bu.h  vr26,    vr25,     0
    vshuf4i.w     vr0,     vr11,     0x4E
    vshuf4i.w     vr1,     vr26,     0x4E
    vilvl.b       vr3,     vr0,      vr11
    vilvl.b       vr7,     vr1,      vr26
    vst           vr3,     a0,       0
    vstx          vr7,     a0,       a1
    vpickev.h     vr0,     vr13,     vr12
    vpickod.h     vr1,     vr13,     vr12
    vpickev.h     vr2,     vr15,     vr14
    vpickod.h     vr3,     vr15,     vr14
    vadd.h        vr4,     vr0,      vr1
    vadd.h        vr5,     vr2,      vr3
    vadd.h        vr4,     vr4,      vr5
    vsub.h        vr4,     vr4,      vr21
    vssrarni.bu.h vr4,     vr4,      2
    vstelm.d      vr4,     a6,       0,    0

    alsl.d        a2,      a4,       a2,   1
    alsl.d        a3,      a4,       a3,   1
    alsl.d        a0,      a1,       a0,   1
    addi.d        a6,      a6,       8
    addi.w        a5,      a5,       -2
    blt           zero,    a5,       .WMASK420_W16_LSX
    b             .END_W420

.WMASK420_W32_LSX:
.WMASK420_W64_LSX:
.WMASK420_W128_LSX:

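    // w >= 32: two-level loop. The outer level walks a pair of rows
    // (t1/t2 = current tmp1/tmp2 row, t5/t6 = the row below, t7 = remaining
    // width); the inner .W32_420_LSX body consumes 16 pixels per iteration.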
.LOOP_W32_420_LSX:
    add.d         t1,       a2,       zero    // t1 = tmp1 row
    add.d         t2,       a3,       zero    // t2 = tmp2 row
    add.d         t3,       a0,       zero    // t3 = dst row
    add.d         t4,       a6,       zero    // t4 = mask row
    alsl.d        t5,       a4,       t1,     1   // t5 = tmp1 row below
    alsl.d        t6,       a4,       t2,     1   // t6 = tmp2 row below
    or            t7,       a4,       a4      // t7 = remaining width

.W32_420_LSX:
    vld           vr0,      t1,       0
    vld           vr1,      t1,       16
    vld           vr2,      t2,       0
    vld           vr3,      t2,       16
    vld           vr4,      t5,       0
    vld           vr5,      t5,       16
    vld           vr6,      t6,       0
    vld           vr7,      t6,       16
    addi.d        t1,       t1,       32
    addi.d        t2,       t2,       32
    addi.d        t5,       t5,       32
    addi.d        t6,       t6,       32
    addi.w        t7,       t7,       -16
    vabsd.h       vr8,      vr0,      vr2
    vabsd.h       vr9,      vr1,      vr3
    vabsd.h       vr10,     vr4,      vr6
    vabsd.h       vr11,     vr5,      vr7
    vaddi.hu      vr8,      vr8,      8
    vaddi.hu      vr9,      vr9,      8
    vaddi.hu      vr10,     vr10,     8
    vaddi.hu      vr11,     vr11,     8
    vsrli.h       vr8,      vr8,      8
    vsrli.h       vr9,      vr9,      8
    vsrli.h       vr10,     vr10,     8
    vsrli.h       vr11,     vr11,     8
    vadd.h        vr8,      vr8,      vr22
    vadd.h        vr9,      vr9,      vr22
    vadd.h        vr10,     vr10,     vr22
    vadd.h        vr11,     vr11,     vr22
    vmin.hu       vr12,     vr8,      vr20
    vmin.hu       vr13,     vr9,      vr20
    vmin.hu       vr14,     vr10,     vr20
    vmin.hu       vr15,     vr11,     vr20
    vsub.h        vr16,     vr20,     vr12
    vsub.h        vr17,     vr20,     vr13
    vsub.h        vr18,     vr20,     vr14
    vsub.h        vr19,     vr20,     vr15
    vmulwev.w.h   vr8,      vr12,     vr0
    vmulwod.w.h   vr9,      vr12,     vr0
    vmulwev.w.h   vr10,     vr13,     vr1
    vmulwod.w.h   vr11,     vr13,     vr1
    vmulwev.w.h   vr23,     vr14,     vr4
    vmulwod.w.h   vr24,     vr14,     vr4
    vmulwev.w.h   vr25,     vr15,     vr5
    vmulwod.w.h   vr26,     vr15,     vr5
    vmaddwev.w.h  vr8,      vr16,     vr2
    vmaddwod.w.h  vr9,      vr16,     vr2
    vmaddwev.w.h  vr10,     vr17,     vr3
    vmaddwod.w.h  vr11,     vr17,     vr3
    vmaddwev.w.h  vr23,     vr18,     vr6
    vmaddwod.w.h  vr24,     vr18,     vr6
    vmaddwev.w.h  vr25,     vr19,     vr7
    vmaddwod.w.h  vr26,     vr19,     vr7
    vssrarni.hu.w vr10,     vr8,      10
    vssrarni.hu.w vr11,     vr9,      10
    vssrarni.hu.w vr25,     vr23,     10
    vssrarni.hu.w vr26,     vr24,     10
    vssrlni.bu.h  vr11,     vr10,     0
    vssrlni.bu.h  vr26,     vr25,     0
    vshuf4i.w     vr8,      vr11,     0x4E
    vshuf4i.w     vr9,      vr26,     0x4E
    vilvl.b       vr3,      vr8,      vr11
    vilvl.b       vr7,      vr9,      vr26
    vst           vr3,      t3,       0
    vstx          vr7,      a1,       t3
    addi.d        t3,       t3,       16
    vpickev.h     vr8,      vr13,     vr12
    vpickod.h     vr9,      vr13,     vr12
    vpickev.h     vr10,     vr15,     vr14
    vpickod.h     vr11,     vr15,     vr14
    vadd.h        vr8,      vr8,      vr9
    vadd.h        vr10,     vr10,     vr11
    vadd.h        vr12,     vr8,      vr10
    vsub.h        vr12,     vr12,     vr21
    vssrarni.bu.h vr12,     vr12,     2
    vstelm.d      vr12,     t4,       0,     0
    addi.d        t4,       t4,       8
    bne           t7,       zero,     .W32_420_LSX

    alsl.d        a2,       a4,       a2,     2   // advance two tmp rows
    alsl.d        a3,       a4,       a3,     2
    alsl.d        a0,       a1,       a0,     1
    srai.w        t8,       a4,       1           // mask stride = w / 2
    add.d         a6,       a6,       t8
    addi.w        a5,       a5,       -2
    blt           zero,     a5,       .LOOP_W32_420_LSX

.END_W420:
    fld.d            f24,     sp,    0
    fld.d            f25,     sp,    8
    fld.d            f26,     sp,    16
    addi.d           sp,      sp,    24
endfunc

function w_mask_420_8bpc_lasx
    xvldi          xr20,    0x440   // 64
    xvreplgr2vr.h  xr21,    a7      // sign
    xvldi          xr22,    0x426   // 38

    clz.w          t0,      a4
    li.w           t1,      24
    sub.w          t0,      t0,      t1
    la.local       t1,      .WMASK420_LASX_JRTABLE
    alsl.d         t0,      t0,      t1,    1
    ld.h           t8,      t0,      0
    add.d          t1,      t1,      t8
    jirl           $r0,     t1,      0

    .align   3
.WMASK420_LASX_JRTABLE:
    .hword .WMASK420_W128_LASX - .WMASK420_LASX_JRTABLE
    .hword .WMASK420_W64_LASX  - .WMASK420_LASX_JRTABLE
    .hword .WMASK420_W32_LASX  - .WMASK420_LASX_JRTABLE
    .hword .WMASK420_W16_LASX  - .WMASK420_LASX_JRTABLE
    .hword .WMASK420_W8_LASX   - .WMASK420_LASX_JRTABLE
    .hword .WMASK420_W4_LASX   - .WMASK420_LASX_JRTABLE

.WMASK420_W4_LASX:
    xvld           xr0,     a2,     0
    xvld           xr1,     a3,     0
    addi.w         a5,      a5,     -4

    xvabsd.h       xr2,     xr0,    xr1
    xvaddi.hu      xr2,     xr2,    8
    xvsrli.h       xr2,     xr2,    8
    xvadd.h        xr2,     xr2,    xr22
    xvmin.hu       xr3,     xr2,    xr20
    xvsub.h        xr4,     xr20,   xr3
    xvmulwev.w.h   xr5,     xr3,    xr0
    xvmulwod.w.h   xr6,     xr3,    xr0
    xvmaddwev.w.h  xr5,     xr4,    xr1
    xvmaddwod.w.h  xr6,     xr4,    xr1
    xvilvl.w       xr7,     xr6,    xr5
    xvilvh.w       xr8,     xr6,    xr5
    xvssrarni.hu.w xr8,     xr7,    10
    xvssrlni.bu.h  xr9,     xr8,    0
    vstelm.w       vr9,     a0,     0,     0
    add.d          a0,      a0,     a1
    vstelm.w       vr9,     a0,     0,     1
    add.d          a0,      a0,     a1
    xvstelm.w      xr9,     a0,     0,     4
    add.d          a0,      a0,     a1
    xvstelm.w      xr9,     a0,     0,     5
    add.d          a0,      a0,     a1

    xvhaddw.w.h    xr3,     xr3,    xr3
    xvpermi.d      xr4,     xr3,    0xb1
    xvadd.h        xr3,     xr3,    xr4
    xvpickev.h     xr3,     xr3,    xr3
    xvsub.h        xr3,     xr3,    xr21
    xvssrarni.bu.h xr3,     xr3,    2
    vstelm.h       vr3,     a6,     0,     0
    xvstelm.h      xr3,     a6,     2,     8

    addi.d         a2,     a2,      32
    addi.d         a3,     a3,      32
    addi.d         a6,     a6,      4
    blt            zero,   a5,      .WMASK420_W4_LASX
    b              .END_W420_LASX

.WMASK420_W8_LASX:
    xvld           xr0,      a2,     0
    xvld           xr1,      a2,     32
    xvld           xr2,      a3,     0
    xvld           xr3,      a3,     32
    addi.w         a5,       a5,     -4

    xvabsd.h       xr4,      xr0,    xr2
    xvabsd.h       xr5,      xr1,    xr3
    xvaddi.hu      xr4,      xr4,    8
    xvaddi.hu      xr5,      xr5,    8
    xvsrli.h       xr4,      xr4,    8
    xvsrli.h       xr5,      xr5,    8
    xvadd.h        xr4,      xr4,    xr22
    xvadd.h        xr5,      xr5,    xr22
    xvmin.hu       xr6,      xr4,    xr20
    xvmin.hu       xr7,      xr5,    xr20
    xvsub.h        xr8,      xr20,   xr6
    xvsub.h        xr9,      xr20,   xr7
    xvmulwev.w.h   xr10,     xr6,    xr0
    xvmulwod.w.h   xr11,     xr6,    xr0
    xvmulwev.w.h   xr12,     xr7,    xr1
    xvmulwod.w.h   xr13,     xr7,    xr1
    xvmaddwev.w.h  xr10,     xr8,    xr2
    xvmaddwod.w.h  xr11,     xr8,    xr2
    xvmaddwev.w.h  xr12,     xr9,    xr3
    xvmaddwod.w.h  xr13,     xr9,    xr3
    xvssrarni.hu.w xr12,     xr10,   10
    xvssrarni.hu.w xr13,     xr11,   10
    xvssrlni.bu.h  xr13,     xr12,   0
    xvshuf4i.w     xr1,      xr13,   0x4E
    xvilvl.b       xr17,     xr1,    xr13
    vstelm.d       vr17,     a0,     0,     0
    add.d          a0,       a0,     a1
    xvstelm.d      xr17,     a0,     0,     2
    add.d          a0,       a0,     a1
    xvstelm.d      xr17,     a0,     0,     1
    add.d          a0,       a0,     a1
    xvstelm.d      xr17,     a0,     0,     3
    add.d          a0,       a0,     a1

    xvhaddw.w.h    xr6,      xr6,    xr6
    xvhaddw.w.h    xr7,      xr7,    xr7
    xvpickev.h     xr8,      xr7,    xr6
    xvpermi.q      xr9,      xr8,    0x01
    vadd.h         vr8,      vr8,    vr9
    vsub.h         vr8,      vr8,    vr21
    vssrarni.bu.h  vr8,      vr8,    2
    vstelm.d       vr8,      a6,     0,    0
    addi.d         a2,       a2,     64
    addi.d         a3,       a3,     64
    addi.d         a6,       a6,     8
    blt            zero,     a5,     .WMASK420_W8_LASX
    b              .END_W420_LASX

.WMASK420_W16_LASX:
    xvld           xr0,      a2,     0
    xvld           xr1,      a2,     32
    xvld           xr2,      a3,     0
    xvld           xr3,      a3,     32
    addi.w         a5,       a5,     -2

    xvabsd.h       xr4,      xr0,    xr2
    xvabsd.h       xr5,      xr1,    xr3
    xvaddi.hu      xr4,      xr4,    8
    xvaddi.hu      xr5,      xr5,    8
    xvsrli.h       xr4,      xr4,    8
    xvsrli.h       xr5,      xr5,    8
    xvadd.h        xr4,      xr4,    xr22
    xvadd.h        xr5,      xr5,    xr22
    xvmin.hu       xr4,      xr4,    xr20
    xvmin.hu       xr5,      xr5,    xr20
    xvsub.h        xr6,      xr20,   xr4
    xvsub.h        xr7,      xr20,   xr5
    xvmulwev.w.h   xr8,      xr4,    xr0
    xvmulwod.w.h   xr9,      xr4,    xr0
    xvmulwev.w.h   xr10,     xr5,    xr1
    xvmulwod.w.h   xr11,     xr5,    xr1
    xvmaddwev.w.h  xr8,      xr6,    xr2
    xvmaddwod.w.h  xr9,      xr6,    xr2
    xvmaddwev.w.h  xr10,     xr7,    xr3
    xvmaddwod.w.h  xr11,     xr7,    xr3
    xvssrarni.hu.w xr10,     xr8,    10
    xvssrarni.hu.w xr11,     xr9,    10
    xvssrlni.bu.h  xr11,     xr10,   0
    xvshuf4i.w     xr8,      xr11,   0x4E
    xvilvl.b       xr15,     xr8,    xr11
    xvpermi.d      xr16,     xr15,   0xd8
    vst            vr16,     a0,     0
    add.d          a0,       a0,     a1
    xvpermi.q      xr16,     xr16,   0x01
    vst            vr16,     a0,     0
    add.d          a0,       a0,     a1

    xvhaddw.w.h    xr4,      xr4,    xr4
    xvhaddw.w.h    xr5,      xr5,    xr5
    xvadd.h        xr4,      xr5,    xr4
    xvpickev.h     xr6,      xr4,    xr4
    xvpermi.d      xr7,      xr6,    0x08
    vsub.h         vr7,      vr7,    vr21
    vssrarni.bu.h  vr7,      vr7,    2
    vstelm.d       vr7,      a6,     0,    0

    addi.d         a2,       a2,     64
    addi.d         a3,       a3,     64
    addi.d         a6,       a6,     8
    blt            zero,     a5,     .WMASK420_W16_LASX
    b              .END_W420_LASX

.WMASK420_W32_LASX:
.WMASK420_W64_LASX:
.WMASK420_W128_LASX:

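    // same two-level structure as the LSX path above, but each 32-byte xvld
    // covers a full 16-pixel chunk, so one register per row suffices.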
.LOOP_W32_420_LASX:
    add.d          t1,       a2,       zero    // t1 = tmp1 row
    add.d          t2,       a3,       zero    // t2 = tmp2 row
    add.d          t3,       a0,       zero    // t3 = dst row
    add.d          t4,       a6,       zero    // t4 = mask row
    alsl.d         t5,       a4,       t1,     1   // t5 = tmp1 row below
    alsl.d         t6,       a4,       t2,     1   // t6 = tmp2 row below
    or             t7,       a4,       a4      // t7 = remaining width
.W32_420_LASX:
    xvld           xr0,      t1,       0
    xvld           xr1,      t2,       0
    xvld           xr2,      t5,       0
    xvld           xr3,      t6,       0
    addi.d         t1,       t1,       32
    addi.d         t2,       t2,       32
    addi.d         t5,       t5,       32
    addi.d         t6,       t6,       32
    addi.w         t7,       t7,       -16
    xvabsd.h       xr4,      xr0,      xr1
    xvabsd.h       xr5,      xr2,      xr3
    xvaddi.hu      xr4,      xr4,      8
    xvaddi.hu      xr5,      xr5,      8
    xvsrli.h       xr4,      xr4,      8
    xvsrli.h       xr5,      xr5,      8
    xvadd.h        xr4,      xr4,      xr22
    xvadd.h        xr5,      xr5,      xr22
    xvmin.hu       xr6,      xr4,      xr20
    xvmin.hu       xr7,      xr5,      xr20
    xvsub.h        xr8,      xr20,     xr6
    xvsub.h        xr9,      xr20,     xr7
    xvmulwev.w.h   xr10,     xr6,      xr0
    xvmulwod.w.h   xr11,     xr6,      xr0
    xvmulwev.w.h   xr12,     xr7,      xr2
    xvmulwod.w.h   xr13,     xr7,      xr2
    xvmaddwev.w.h  xr10,     xr8,      xr1
    xvmaddwod.w.h  xr11,     xr8,      xr1
    xvmaddwev.w.h  xr12,     xr9,      xr3
    xvmaddwod.w.h  xr13,     xr9,      xr3
    xvssrarni.hu.w xr12,     xr10,     10
    xvssrarni.hu.w xr13,     xr11,     10
    xvssrlni.bu.h  xr13,     xr12,     0
    xvshuf4i.w     xr10,     xr13,     0x4E
    xvilvl.b       xr17,     xr10,     xr13
    xvpermi.d      xr18,     xr17,     0x08
    xvpermi.d      xr19,     xr17,     0x0d
    vst            vr18,     t3,       0
    vstx           vr19,     t3,       a1
    addi.d         t3,       t3,       16

    xvhaddw.w.h    xr6,      xr6,      xr6
    xvhaddw.w.h    xr7,      xr7,      xr7
    xvadd.h        xr6,      xr7,      xr6
    xvpickev.h     xr7,      xr6,      xr6
    xvpermi.d      xr8,      xr7,      0x08
    vsub.h         vr9,      vr8,      vr21
    vssrarni.bu.h  vr9,      vr9,      2
    vstelm.d       vr9,      t4,       0,      0
    addi.d         t4,       t4,       8
    bne            t7,       zero,     .W32_420_LASX

    alsl.d         a2,       a4,       a2,     2   // advance two tmp rows
    alsl.d         a3,       a4,       a3,     2
    alsl.d         a0,       a1,       a0,     1
    srai.w         t8,       a4,       1           // mask stride = w / 2
    add.d          a6,       a6,       t8
    addi.w         a5,       a5,       -2
    blt            zero,     a5,       .LOOP_W32_420_LASX

.END_W420_LASX:
endfunc

#undef bpc_sh
#undef bpcw_sh

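// helpers named after the two-step horizontal folds they expand to:
// .d.h folds four adjacent .h lanes into one .d sum, and .q.w folds four
// adjacent .w lanes into one .q sum.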
.macro  vhaddw.d.h  in0
    vhaddw.w.h  \in0,  \in0,  \in0
    vhaddw.d.w  \in0,  \in0,  \in0
.endm
.macro  vhaddw.q.w  in0
    vhaddw.d.w  \in0,  \in0,  \in0
    vhaddw.q.d  \in0,  \in0,  \in0
.endm
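// 8-tap horizontal filter over 8 output pixels held in \in0: vr6/vr7/vr8 are
// the sliding-window shuffle patterns, vr10/vr11 the broadcast tap pairs, and
// vr9 the rounding bias added before the caller's final saturating >> 6.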
.macro PUT_H_8W in0
    vshuf.b          vr2,    \in0,  \in0,   vr6
    vshuf.b          vr3,    \in0,  \in0,   vr7
    vshuf.b          vr4,    \in0,  \in0,   vr8
    vmulwev.h.bu.b   vr12,   vr2,   vr10
    vmulwev.h.bu.b   vr13,   vr3,   vr11
    vmulwev.h.bu.b   vr14,   vr3,   vr10
    vmulwev.h.bu.b   vr15,   vr4,   vr11
    vmaddwod.h.bu.b  vr12,   vr2,   vr10
    vmaddwod.h.bu.b  vr13,   vr3,   vr11
    vmaddwod.h.bu.b  vr14,   vr3,   vr10
    vmaddwod.h.bu.b  vr15,   vr4,   vr11
    vadd.h           vr12,   vr12,  vr13
    vadd.h           vr14,   vr14,  vr15
    vhaddw.w.h       vr12,   vr12,  vr12
    vhaddw.w.h       vr14,   vr14,  vr14
    vpickev.h        \in0,   vr14,  vr12
    vadd.h           \in0,   \in0,  vr9
.endm

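// vshuf.b index tables for the horizontal filters: each 16-byte row gathers
// the sliding source-byte windows that one group of taps consumes.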
const subpel_h_shuf0
.byte 0, 1, 2, 3, 1, 2, 3, 4, 16, 17, 18, 19, 17, 18, 19, 20
endconst
const subpel_h_shuf1
.byte 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
endconst
const subpel_h_shuf2
.byte 0, 1, 2, 3, 1, 2, 3, 4,  8,  9, 10, 11,  9, 10, 11, 12
.byte 2, 3, 4, 5, 3, 4, 5, 6, 10, 11, 12, 13, 11, 12, 13, 14
endconst
const subpel_h_shuf3
.byte 0, 4, 1, 5, 2, 6, 3, 7, 4, 8, 5, 9, 6, 10, 7, 11
.byte 0, 4, 1, 5, 2, 6, 3, 7, 4, 8, 5, 9, 6, 10, 7, 11
endconst

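// like PUT_H_8W, but the result stays at 16-bit intermediate precision via a
// rounding >> 2 (6 - intermediate_bits at 8 bpc) instead of being narrowed to
// pixels, for use by a following pass.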
.macro FILTER_8TAP_8W in0
    vshuf.b         vr13,    \in0,  \in0,  vr7
    vshuf.b         vr14,    \in0,  \in0,  vr11
    vshuf.b         vr15,    \in0,  \in0,  vr12
    vmulwev.h.bu.b  vr16,    vr13,  vr8
    vmulwev.h.bu.b  vr17,    vr14,  vr10
    vmulwev.h.bu.b  vr18,    vr14,  vr8
    vmulwev.h.bu.b  vr19,    vr15,  vr10
    vmaddwod.h.bu.b vr16,    vr13,  vr8
    vmaddwod.h.bu.b vr17,    vr14,  vr10
    vmaddwod.h.bu.b vr18,    vr14,  vr8
    vmaddwod.h.bu.b vr19,    vr15,  vr10
    vadd.h          vr16,    vr16,  vr17
    vadd.h          vr18,    vr18,  vr19
    vhaddw.w.h      vr16,    vr16,  vr16
    vhaddw.w.h      \in0,    vr18,  vr18
    vssrarni.h.w    \in0,    vr16,  2
.endm

.macro PUT_8TAP_8BPC_LSX lable
    li.w             t0,     4  // w/h threshold for the 4-tap filter sets
    la.local         t6,     dav1d_mc_subpel_filters
    slli.d           t2,     a3,    1  // src_stride * 2
    add.d            t3,     t2,    a3 // src_stride * 3
    slli.d           t4,     t2,    1  // src_stride * 4

    bnez             a6,     .l_\lable\()put_h // mx != 0: horizontal filtering
    bnez             a7,     .l_\lable\()put_v // my != 0: vertical filtering

    clz.w            t1,     a4
    li.w             t5,     24
    sub.w            t1,     t1,    t5
    la.local         t5,     .l_\lable\()put_hv0_jtable
    alsl.d           t1,     t1,    t5,   3
    ld.d             t6,     t1,    0
    add.d            t5,     t5,    t6
    jirl             $r0,    t5,    0

    .align   3
.l_\lable\()put_hv0_jtable:
    .dword .l_\lable\()put_hv0_128w - .l_\lable\()put_hv0_jtable
    .dword .l_\lable\()put_hv0_64w  - .l_\lable\()put_hv0_jtable
    .dword .l_\lable\()put_hv0_32w  - .l_\lable\()put_hv0_jtable
    .dword .l_\lable\()put_hv0_16w  - .l_\lable\()put_hv0_jtable
    .dword .l_\lable\()put_hv0_8w   - .l_\lable\()put_hv0_jtable
    .dword .l_\lable\()put_hv0_4w   - .l_\lable\()put_hv0_jtable
    .dword .l_\lable\()put_hv0_2w   - .l_\lable\()put_hv0_jtable

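// mx == 0 && my == 0: no filtering at all, just width-specialized two-row
// copies from src to dst.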
.l_\lable\()put_hv0_2w:
    vldrepl.h        vr0,    a2,    0
    add.d            a2,     a2,    a3
    vldrepl.h        vr1,    a2,    0
    vstelm.h         vr0,    a0,    0,     0
    add.d            a0,     a0,    a1
    vstelm.h         vr1,    a0,    0,     0
    add.d            a2,     a2,    a3
    add.d            a0,     a0,    a1
    addi.w           a5,     a5,    -2
    bnez             a5,     .l_\lable\()put_hv0_2w
    b                .l_\lable\()end_put_8tap
.l_\lable\()put_hv0_4w:
    fld.s            f0,     a2,    0
    fldx.s           f1,     a2,    a3
    fst.s            f0,     a0,    0
    fstx.s           f1,     a0,    a1
    alsl.d           a2,     a3,    a2,    1
    alsl.d           a0,     a1,    a0,    1
    addi.w           a5,     a5,    -2
    bnez             a5,     .l_\lable\()put_hv0_4w
    b                .l_\lable\()end_put_8tap
.l_\lable\()put_hv0_8w:
    fld.d            f0,     a2,    0
    fldx.d           f1,     a2,    a3
    fst.d            f0,     a0,    0
    fstx.d           f1,     a0,    a1
    alsl.d           a2,     a3,    a2,    1
    alsl.d           a0,     a1,    a0,    1
    addi.w           a5,     a5,    -2
    bnez             a5,     .l_\lable\()put_hv0_8w
    b                .l_\lable\()end_put_8tap
.l_\lable\()put_hv0_16w:
    vld              vr0,    a2,    0
    vldx             vr1,    a2,    a3
    vst              vr0,    a0,    0
    vstx             vr1,    a0,    a1
    alsl.d           a2,     a3,    a2,    1
    alsl.d           a0,     a1,    a0,    1
    addi.w           a5,     a5,    -2
    bnez             a5,     .l_\lable\()put_hv0_16w
    b                .l_\lable\()end_put_8tap
.l_\lable\()put_hv0_32w:
    vld              vr0,    a2,    0
    vld              vr1,    a2,    16
    add.d            a2,     a2,    a3
    vld              vr2,    a2,    0
    vld              vr3,    a2,    16
    vst              vr0,    a0,    0
    vst              vr1,    a0,    16
    add.d            a0,     a0,    a1
    vst              vr2,    a0,    0
    vst              vr3,    a0,    16
    add.d            a2,     a2,    a3
    add.d            a0,     a0,    a1
    addi.w           a5,     a5,    -2
    bnez             a5,     .l_\lable\()put_hv0_32w
    b                .l_\lable\()end_put_8tap
.l_\lable\()put_hv0_64w:
    vld              vr0,    a2,    0
    vld              vr1,    a2,    16
    vld              vr2,    a2,    32
    vld              vr3,    a2,    48
    add.d            a2,     a2,    a3
    vld              vr4,    a2,    0
    vld              vr5,    a2,    16
    vld              vr6,    a2,    32
    vld              vr7,    a2,    48
    add.d            a2,     a2,    a3
    vst              vr0,    a0,    0
    vst              vr1,    a0,    16
    vst              vr2,    a0,    32
    vst              vr3,    a0,    48
    add.d            a0,     a0,    a1
    vst              vr4,    a0,    0
    vst              vr5,    a0,    16
    vst              vr6,    a0,    32
    vst              vr7,    a0,    48
    add.d            a0,     a0,    a1
    addi.w           a5,     a5,    -2
    bnez             a5,     .l_\lable\()put_hv0_64w
    b                .l_\lable\()end_put_8tap
.l_\lable\()put_hv0_128w:
    vld              vr0,    a2,    0
    vld              vr1,    a2,    16
    vld              vr2,    a2,    32
    vld              vr3,    a2,    48
    vld              vr4,    a2,    64
    vld              vr5,    a2,    80
    vld              vr6,    a2,    96
    vld              vr7,    a2,    112
    add.d            a2,     a2,    a3
    vld              vr8,    a2,    0
    vld              vr9,    a2,    16
    vld              vr10,   a2,    32
    vld              vr11,   a2,    48
    vld              vr12,   a2,    64
    vld              vr13,   a2,    80
    vld              vr14,   a2,    96
    vld              vr15,   a2,    112
    add.d            a2,     a2,    a3
    vst              vr0,    a0,    0
    vst              vr1,    a0,    16
    vst              vr2,    a0,    32
    vst              vr3,    a0,    48
    vst              vr4,    a0,    64
    vst              vr5,    a0,    80
    vst              vr6,    a0,    96
    vst              vr7,    a0,    112
    add.d            a0,     a0,    a1
    vst              vr8,    a0,    0
    vst              vr9,    a0,    16
    vst              vr10,   a0,    32
    vst              vr11,   a0,    48
    vst              vr12,   a0,    64
    vst              vr13,   a0,    80
    vst              vr14,   a0,    96
    vst              vr15,   a0,    112
    add.d            a0,     a0,    a1
    addi.w           a5,     a5,    -2
    bnez             a5,     .l_\lable\()put_hv0_128w
    b                .l_\lable\()end_put_8tap

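// horizontal-only path: pick the filter set from filter_type, then index row
// (mx - 1) of the chosen 15 x 8-byte filter table.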
.l_\lable\()put_h:
    bnez             a7,     .l_\lable\()put_hv // fh && fv: combined H+V path
    ld.d             t5,     sp,    0  // filter_type (stack argument)
    andi             t1,     t5,    3
    blt              t0,     a4,    .l_\lable\()put_h_idx_fh // w > 4: 8-tap set
    andi             t1,     t5,    1
    addi.w           t1,     t1,    3  // w <= 4: 4-tap sets

.l_\lable\()put_h_idx_fh:
    addi.w           t5,     zero,  120 // 15 filters * 8 taps per set
    mul.w            t1,     t1,    t5
    addi.w           t5,     a6,    -1
    slli.w           t5,     t5,    3
    add.w            t1,     t1,    t5
    add.d            t7,     t6,    t1 // t7 = address of fh
    li.w             t1,     34     // rounding bias for the final >> 6
    vreplgr2vr.h     vr9,    t1

    clz.w            t1,     a4
    li.w             t5,     24
    sub.w            t1,     t1,    t5
    la.local         t5,     .l_\lable\()put_h_jtable
    alsl.d           t1,     t1,    t5,   3
    ld.d             t6,     t1,    0
    add.d            t5,     t5,    t6
    jirl             $r0,    t5,    0

    .align   3
.l_\lable\()put_h_jtable:
    .dword .l_\lable\()put_h_128w - .l_\lable\()put_h_jtable
    .dword .l_\lable\()put_h_64w  - .l_\lable\()put_h_jtable
    .dword .l_\lable\()put_h_32w  - .l_\lable\()put_h_jtable
    .dword .l_\lable\()put_h_16w  - .l_\lable\()put_h_jtable
    .dword .l_\lable\()put_h_8w   - .l_\lable\()put_h_jtable
    .dword .l_\lable\()put_h_4w   - .l_\lable\()put_h_jtable
    .dword .l_\lable\()put_h_2w   - .l_\lable\()put_h_jtable

.l_\lable\()put_h_2w:
    addi.d           t7,     t7,    2  // 4-tap filters: taps live at 2..5
    addi.d           a2,     a2,    -1
    vldrepl.w        vr8,    t7,    0
    la.local         t7,     subpel_h_shuf0
    vld              vr7,    t7,    0
.l_\lable\()put_h_2w_loop:
    vld              vr0,    a2,    0
    vldx             vr1,    a2,    a3
    add.d            a2,     a2,    t2

    vshuf.b          vr0,    vr1,   vr0,   vr7
    vdp2.h.bu.b      vr1,    vr0,   vr8
    vhaddw.w.h       vr0,    vr1,   vr1
    vpickev.h        vr0,    vr0,   vr0
    vadd.h           vr0,    vr0,   vr9
    vssrani.bu.h     vr0,    vr0,   6

    vstelm.h         vr0,    a0,    0,     0
    add.d            a0,     a0,    a1
    vstelm.h         vr0,    a0,    0,     1
    add.d            a0,     a0,    a1
    addi.w           a5,     a5,    -2
    bnez             a5,     .l_\lable\()put_h_2w_loop
    b                .l_\lable\()end_put_8tap

.l_\lable\()put_h_4w:
    addi.d           t7,     t7,    2  // 4-tap filters: taps live at 2..5
    addi.d           a2,     a2,    -1
    vldrepl.w        vr8,    t7,    0
    la.local         t7,     subpel_h_shuf1
    vld              vr7,    t7,    0
.l_\lable\()put_h_4w_loop:
    vld              vr0,    a2,    0
    vldx             vr1,    a2,    a3
    add.d            a2,     a2,    t2

    vshuf.b          vr0,    vr0,   vr0,   vr7
    vshuf.b          vr1,    vr1,   vr1,   vr7
    vmulwev.h.bu.b   vr2,    vr0,   vr8
    vmulwev.h.bu.b   vr3,    vr1,   vr8
    vmaddwod.h.bu.b  vr2,    vr0,   vr8
    vmaddwod.h.bu.b  vr3,    vr1,   vr8
    vhaddw.w.h       vr0,    vr2,   vr2
    vhaddw.w.h       vr1,    vr3,   vr3
    vpickev.h        vr0,    vr1,   vr0
    vadd.h           vr0,    vr0,   vr9
    vssrani.bu.h     vr0,    vr0,   6

    vstelm.w         vr0,    a0,    0,     0
    add.d            a0,     a0,    a1
    vstelm.w         vr0,    a0,    0,     1
    add.d            a0,     a0,    a1
    addi.d           a5,     a5,    -2
    bnez             a5,     .l_\lable\()put_h_4w_loop
    b                .l_\lable\()end_put_8tap

.l_\lable\()put_h_8w:
    fld.d            f10,    t7,    0
    vreplvei.w       vr11,   vr10,  1
    vreplvei.w       vr10,   vr10,  0
    la.local         t7,     subpel_h_shuf1
    vld              vr6,    t7,    0
    vaddi.bu         vr7,    vr6,   4
    vaddi.bu         vr8,    vr6,   8
    addi.d           a2,     a2,    -3
.l_\lable\()put_h_8w_loop:
    vld              vr0,    a2,    0
    vldx             vr1,    a2,    a3
    add.d            a2,     a2,    t2
    PUT_H_8W         vr0
    PUT_H_8W         vr1
    vssrani.bu.h     vr1,    vr0,   6
    vstelm.d         vr1,    a0,    0,    0
    add.d            a0,     a0,    a1
    vstelm.d         vr1,    a0,    0,    1
    add.d            a0,     a0,    a1
    addi.w           a5,     a5,    -2
    bnez             a5,     .l_\lable\()put_h_8w_loop
    b                .l_\lable\()end_put_8tap

.l_\lable\()put_h_16w:
.l_\lable\()put_h_32w:
.l_\lable\()put_h_64w:
.l_\lable\()put_h_128w:
    fld.d            f10,    t7,    0
    vreplvei.w       vr11,   vr10,  1
    vreplvei.w       vr10,   vr10,  0
    la.local         t7,     subpel_h_shuf1
    vld              vr6,    t7,    0
    vaddi.bu         vr7,    vr6,   4
    vaddi.bu         vr8,    vr6,   8
    addi.d           a2,     a2,    -3
    addi.d           t0,     a2,    0 // src, saved per 16-pixel column strip
    addi.w           t5,     a5,    0 // h
    addi.d           t8,     a0,    0 // dst
.l_\lable\()put_h_16w_loop:
    vld              vr0,    a2,    0
    vld              vr1,    a2,    8
    add.d            a2,     a2,    a3
    PUT_H_8W         vr0
    PUT_H_8W         vr1
    vssrani.bu.h     vr1,    vr0,   6
    vst              vr1,    a0,    0
    add.d            a0,     a0,    a1
    addi.d           a5,     a5,    -1
    bnez             a5,     .l_\lable\()put_h_16w_loop
    addi.d           a2,     t0,    16
    addi.d           t0,     t0,    16
    addi.d           a0,     t8,    16
    addi.d           t8,     t8,    16
    addi.w           a5,     t5,    0
    addi.w           a4,     a4,    -16
    bnez             a4,     .l_\lable\()put_h_16w_loop
    b                .l_\lable\()end_put_8tap

.l_\lable\()put_v:
    ld.d             t1,     sp,    0  // filter_type (stack argument)
    srli.w           t1,     t1,    2  // vertical filter-type bits
    blt              t0,     a5,    .l_\lable\()put_v_idx_fv // h > 4: 8-tap set
    andi             t1,     t1,    1
    addi.w           t1,     t1,    3  // h <= 4: 4-tap sets

.l_\lable\()put_v_idx_fv:
    addi.w           t5,     zero,  120 // 15 filters * 8 taps per set
    mul.w            t1,     t1,    t5
    addi.w           t5,     a7,    -1
    slli.w           t5,     t5,    3
    add.w            t1,     t1,    t5
    add.d            t1,     t6,    t1 // t1 = address of fv
    vldrepl.d        vr8,    t1,    0
    sub.d            a2,     a2,    t3 // back up src by 3 rows

    vilvl.h          vr8,    vr8,   vr8 // duplicate each tap pair
    vreplvei.w       vr9,    vr8,   1   // taps 2,3
    vreplvei.w       vr10,   vr8,   2   // taps 4,5
    vreplvei.w       vr11,   vr8,   3   // taps 6,7
    vreplvei.w       vr8,    vr8,   0   // taps 0,1

    clz.w            t1,     a4
    li.w             t5,     24
    sub.w            t1,     t1,    t5
    la.local         t5,     .l_\lable\()put_v_jtable
    alsl.d           t1,     t1,    t5,   3
    ld.d             t6,     t1,    0
    add.d            t5,     t5,    t6
    jirl             $r0,    t5,    0

    .align   3
.l_\lable\()put_v_jtable:
    .dword .l_\lable\()put_v_128w - .l_\lable\()put_v_jtable
    .dword .l_\lable\()put_v_64w  - .l_\lable\()put_v_jtable
    .dword .l_\lable\()put_v_32w  - .l_\lable\()put_v_jtable
    .dword .l_\lable\()put_v_16w  - .l_\lable\()put_v_jtable
    .dword .l_\lable\()put_v_8w   - .l_\lable\()put_v_jtable
    .dword .l_\lable\()put_v_4w   - .l_\lable\()put_v_jtable
    .dword .l_\lable\()put_v_2w   - .l_\lable\()put_v_jtable

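// vertical-only path: source rows are interleaved two at a time so each
// even/odd widening multiply applies one tap pair (vr8..vr11 above) to two
// rows at once.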
.l_\lable\()put_v_2w:
    fld.s            f0,     a2,    0
    fldx.s           f1,     a2,    a3
    fldx.s           f2,     a2,    t2
    add.d            a2,     a2,    t3
    fld.s            f3,     a2,    0
    fldx.s           f4,     a2,    a3
    fldx.s           f5,     a2,    t2
    fldx.s           f6,     a2,    t3
    add.d            a2,     a2,    t4

    vilvl.h          vr0,    vr1,   vr0 //0 1
    vilvl.h          vr1,    vr2,   vr1 //1 2
    vilvl.b          vr0,    vr1,   vr0 //01 12
    vilvl.h          vr2,    vr3,   vr2 //2 3
    vilvl.h          vr3,    vr4,   vr3 //3 4
    vilvl.b          vr1,    vr3,   vr2 //23 34
    vilvl.h          vr2,    vr5,   vr4 //4 5
    vilvl.h          vr3,    vr6,   vr5 //5 6
    vilvl.b          vr2,    vr3,   vr2 //45 56
.l_\lable\()put_v_2w_loop:
    fld.s            f7,     a2,    0
    vilvl.h          vr3,    vr7,   vr6 //6 7
    fldx.s           f6,     a2,    a3
    add.d            a2,     a2,    t2
    vilvl.h          vr4,    vr6,   vr7 //7 8
    vilvl.b          vr3,    vr4,   vr3 //67 78

    vmulwev.h.bu.b   vr12,   vr0,   vr8
    vmulwev.h.bu.b   vr13,   vr1,   vr9
    vmulwev.h.bu.b   vr14,   vr2,   vr10
    vmulwev.h.bu.b   vr15,   vr3,   vr11
    vmaddwod.h.bu.b  vr12,   vr0,   vr8
    vmaddwod.h.bu.b  vr13,   vr1,   vr9
    vmaddwod.h.bu.b  vr14,   vr2,   vr10
    vmaddwod.h.bu.b  vr15,   vr3,   vr11
    vaddi.hu         vr0,    vr1,   0   // move: rotate the row history
    vaddi.hu         vr1,    vr2,   0
    vaddi.hu         vr2,    vr3,   0
    vadd.h           vr12,   vr12,  vr13
    vadd.h           vr12,   vr12,  vr14
    vadd.h           vr12,   vr12,  vr15

    vssrarni.bu.h    vr12,   vr12,  6
    vstelm.h         vr12,   a0,    0,   0
    add.d            a0,     a0,    a1
    vstelm.h         vr12,   a0,    0,   1
    add.d            a0,     a0,    a1
    addi.w           a5,     a5,    -2
    bnez             a5,     .l_\lable\()put_v_2w_loop
    b                .l_\lable\()end_put_8tap

2835.l_\lable\()put_v_4w:
2836    fld.s            f0,     a2,    0
2837    fldx.s           f1,     a2,    a3
2838    fldx.s           f2,     a2,    t2
2839    add.d            a2,     a2,    t3
2840    fld.s            f3,     a2,    0
2841    fldx.s           f4,     a2,    a3
2842    fldx.s           f5,     a2,    t2
2843    fldx.s           f6,     a2,    t3
2844    add.d            a2,     a2,    t4
2845
2846    vilvl.w          vr0,    vr1,   vr0
2847    vilvl.w          vr1,    vr2,   vr1
2848    vilvl.b          vr0,    vr1,   vr0
2849    vilvl.w          vr1,    vr3,   vr2
2850    vilvl.w          vr2,    vr4,   vr3
2851    vilvl.b          vr1,    vr2,   vr1
2852    vilvl.w          vr2,    vr5,   vr4
2853    vilvl.w          vr3,    vr6,   vr5
2854    vilvl.b          vr2,    vr3,   vr2
2855.l_\lable\()put_v_4w_loop:
2856    fld.s            f7,     a2,    0
2857
2858    vilvl.w          vr3,    vr7,   vr6
2859    fldx.s           f6,     a2,    a3
2860    add.d            a2,     a2,    t2
2861    vilvl.w          vr4,    vr6,   vr7
2862    vilvl.b          vr3,    vr4,   vr3
2863
2864    vmulwev.h.bu.b   vr12,   vr0,   vr8
2865    vmulwev.h.bu.b   vr13,   vr1,   vr9
2866    vmulwev.h.bu.b   vr14,   vr2,   vr10
2867    vmulwev.h.bu.b   vr15,   vr3,   vr11
2868    vmaddwod.h.bu.b  vr12,   vr0,   vr8
2869    vmaddwod.h.bu.b  vr13,   vr1,   vr9
2870    vmaddwod.h.bu.b  vr14,   vr2,   vr10
2871    vmaddwod.h.bu.b  vr15,   vr3,   vr11
2872    vaddi.hu         vr0,    vr1,   0 // slide row cache down two rows
2873    vaddi.hu         vr1,    vr2,   0
2874    vaddi.hu         vr2,    vr3,   0
2875    vadd.h           vr12,   vr12,  vr13
2876    vadd.h           vr12,   vr12,  vr14
2877    vadd.h           vr12,   vr12,  vr15
2878
2879    vssrarni.bu.h    vr12,   vr12,  6
2880    vstelm.w         vr12,   a0,    0,   0
2881    add.d            a0,     a0,    a1
2882    vstelm.w         vr12,   a0,    0,   1
2883    add.d            a0,     a0,    a1
2884    addi.w           a5,     a5,    -2
2885    bnez             a5,     .l_\lable\()put_v_4w_loop
2886    b                .l_\lable\()end_put_8tap
2887
2888.l_\lable\()put_v_8w:
2889.l_\lable\()put_v_16w:
2890.l_\lable\()put_v_32w:
2891.l_\lable\()put_v_64w:
2892.l_\lable\()put_v_128w:
2893    addi.d           t0,     a2,    0 //src
2894    addi.d           t5,     a5,    0 //h
2895    addi.d           t8,     a0,    0 //dst
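    // Widths >= 8 run as 8-pixel-wide columns: the inner loop walks the
    // full height, then src/dst advance by 8 bytes and a4 (w) drops by 8.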
2896.l_\lable\()put_v_8w_loop0:
2897    fld.d            f0,     a2,    0
2898    fldx.d           f1,     a2,    a3
2899    fldx.d           f2,     a2,    t2
2900    add.d            a2,     a2,    t3
2901    fld.d            f3,     a2,    0
2902    fldx.d           f4,     a2,    a3
2903    fldx.d           f5,     a2,    t2
2904    fldx.d           f6,     a2,    t3
2905    add.d            a2,     a2,    t4
2906
2907    vilvl.b          vr0,    vr1,   vr0 //0 1
2908    vilvl.b          vr1,    vr2,   vr1 //1 2
2909    vilvl.b          vr2,    vr3,   vr2 //2 3
2910    vilvl.b          vr3,    vr4,   vr3 //3 4
2911    vilvl.b          vr4,    vr5,   vr4 //4 5
2912    vilvl.b          vr5,    vr6,   vr5 //5 6
2913.l_\lable\()put_v_8w_loop:
2914    fld.d            f7,     a2,    0
2915    vilvl.b          vr12,   vr7,   vr6 //6 7
2916    fldx.d           f6,     a2,    a3
2917    add.d            a2,     a2,    t2
2918    vilvl.b          vr13,   vr6,   vr7 //7 8
2919
2920    vmulwev.h.bu.b   vr14,   vr0,   vr8
2921    vmulwev.h.bu.b   vr15,   vr1,   vr8
2922    vmulwev.h.bu.b   vr16,   vr2,   vr9
2923    vmulwev.h.bu.b   vr17,   vr3,   vr9
2924    vmulwev.h.bu.b   vr18,   vr4,   vr10
2925    vmulwev.h.bu.b   vr19,   vr5,   vr10
2926    vmulwev.h.bu.b   vr20,   vr12,  vr11
2927    vmulwev.h.bu.b   vr21,   vr13,  vr11
2928    vmaddwod.h.bu.b  vr14,   vr0,   vr8
2929    vmaddwod.h.bu.b  vr15,   vr1,   vr8
2930    vmaddwod.h.bu.b  vr16,   vr2,   vr9
2931    vmaddwod.h.bu.b  vr17,   vr3,   vr9
2932    vmaddwod.h.bu.b  vr18,   vr4,   vr10
2933    vmaddwod.h.bu.b  vr19,   vr5,   vr10
2934    vmaddwod.h.bu.b  vr20,   vr12,  vr11
2935    vmaddwod.h.bu.b  vr21,   vr13,  vr11
2936
2937    vaddi.hu         vr0,    vr2,   0 // slide row cache down two rows
2938    vaddi.hu         vr1,    vr3,   0
2939    vaddi.hu         vr2,    vr4,   0
2940    vaddi.hu         vr3,    vr5,   0
2941    vaddi.hu         vr4,    vr12,  0
2942    vaddi.hu         vr5,    vr13,  0
2943    vadd.h           vr14,   vr14,  vr16
2944    vadd.h           vr14,   vr14,  vr18
2945    vadd.h           vr14,   vr14,  vr20
2946    vadd.h           vr15,   vr15,  vr17
2947    vadd.h           vr15,   vr15,  vr19
2948    vadd.h           vr15,   vr15,  vr21
2949
2950    vssrarni.bu.h    vr15,   vr14,  6
2951    vstelm.d         vr15,   a0,    0,   0
2952    add.d            a0,     a0,    a1
2953    vstelm.d         vr15,   a0,    0,   1
2954    add.d            a0,     a0,    a1
2955    addi.w           a5,     a5,    -2
2956    bnez             a5,     .l_\lable\()put_v_8w_loop
2957    addi.d           a2,     t0,    8
2958    addi.d           t0,     t0,    8
2959    addi.d           a0,     t8,    8
2960    addi.d           t8,     t8,    8
2961    addi.d           a5,     t5,    0
2962    addi.w           a4,     a4,    -8
2963    bnez             a4,     .l_\lable\()put_v_8w_loop0
2964    b                .l_\lable\()end_put_8tap
2965
2966.l_\lable\()put_hv:
2967    ld.d             t5,     sp,    0  //filter_type
2968    andi             t1,     t5,    3
2969    blt              t0,     a4,    .l_\lable\()put_hv_idx_fh
2970    andi             t1,     t5,    1
2971    addi.w           t1,     t1,    3
2972.l_\lable\()put_hv_idx_fh:
2973    addi.w           t5,     zero,  120
2974    mul.w            t1,     t1,    t5
2975    addi.w           t5,     a6,    -1
2976    slli.w           t5,     t5,    3
2977    add.w            t1,     t1,    t5
2978    add.d            t1,     t6,    t1 //fh's offset
2979    vldrepl.d        vr8,    t1,    0
2980    ld.d             t1,     sp,    0  //filter_type
2981    srli.w           t1,     t1,    2
2982    blt              t0,     a5,    .l_\lable\()put_hv_idx_fv
2983    andi             t1,     t1,    1
2984    addi.w           t1,     t1,    3
2985.l_\lable\()put_hv_idx_fv:
2986    addi.w           t5,     zero,  120
2987    mul.w            t1,     t1,    t5
2988    addi.w           t5,     a7,    -1
2989    slli.w           t5,     t5,    3
2990    add.w            t1,     t1,    t5
2991    add.d            t1,     t6,    t1 //fv's offset
2992    vldrepl.d        vr9,    t1,    0
2993    vexth.h.b        vr9,    vr9
2994
2995    sub.d            a2,     a2,    t3 // rewind src 3 rows
2996    addi.d           a2,     a2,    -3 // and 3 columns (8-tap center)
2997
2998    clz.w            t1,     a4
2999    li.w             t5,     24
3000    sub.w            t1,     t1,    t5
3001    la.local         t5,     .l_\lable\()put_hv_jtable
3002    alsl.d           t1,     t1,    t5,   3
3003    ld.d             t6,     t1,    0
3004    add.d            t5,     t5,    t6
3005    jirl             $r0,    t5,    0
3006
3007    .align   3
3008.l_\lable\()put_hv_jtable:
3009    .dword .l_\lable\()put_hv_128w - .l_\lable\()put_hv_jtable
3010    .dword .l_\lable\()put_hv_64w  - .l_\lable\()put_hv_jtable
3011    .dword .l_\lable\()put_hv_32w  - .l_\lable\()put_hv_jtable
3012    .dword .l_\lable\()put_hv_16w  - .l_\lable\()put_hv_jtable
3013    .dword .l_\lable\()put_hv_8w   - .l_\lable\()put_hv_jtable
3014    .dword .l_\lable\()put_hv_4w   - .l_\lable\()put_hv_jtable
3015    .dword .l_\lable\()put_hv_2w   - .l_\lable\()put_hv_jtable
3016
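/*
H+V paths: the horizontal 8-tap pass narrows to 16 bits with a 2-bit
rounding shift and the vertical pass accumulates those mids in 32 bits
before a final 10-bit rounding shift. As a scalar sketch (taps sum to
64, so the intermediate carries a pixel << 4 scale):

    mid[y][x] = (sum(k) fh[k] * src[y][x - 3 + k] +   2) >> 2;
    dst[y][x] = clip_u8((sum(k) fv[k] * mid[y - 3 + k][x] + 512) >> 10);
*/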
3017.l_\lable\()put_hv_2w:
3018    addi.d           a2,     a2,    2 // net column offset -1: 4-tap read
3019    vld              vr0,    a2,    0
3020    vldx             vr1,    a2,    a3
3021    vldx             vr2,    a2,    t2
3022    add.d            a2,     a2,    t3
3023    vld              vr3,    a2,    0
3024    vldx             vr4,    a2,    a3
3025    vldx             vr5,    a2,    t2
3026    vldx             vr6,    a2,    t3
3027    add.d            a2,     a2,    t4
3028
3029    la.local         t1,     subpel_h_shuf0
3030    vld              vr7,    t1,    0
3031    vbsrl.v          vr8,    vr8,   2 // drop the two zero outer taps
3032    vreplvei.w       vr8,    vr8,   0 // broadcast taps 2..5
3033
3034    //fv
3035    vreplvei.w       vr14,   vr9,   1
3036    vreplvei.w       vr15,   vr9,   2
3037    vreplvei.w       vr16,   vr9,   3
3038    vreplvei.w       vr9,    vr9,   0
3039
3040    vshuf.b          vr0,    vr1,   vr0,  vr7
3041    vshuf.b          vr1,    vr3,   vr2,  vr7
3042    vshuf.b          vr2,    vr5,   vr4,  vr7
3043    vshuf.b          vr3,    vr6,   vr6,  vr7
3044    vmulwev.h.bu.b   vr10,   vr0,   vr8
3045    vmulwev.h.bu.b   vr11,   vr1,   vr8
3046    vmulwev.h.bu.b   vr12,   vr2,   vr8
3047    vmulwev.h.bu.b   vr13,   vr3,   vr8
3048    vmaddwod.h.bu.b  vr10,   vr0,   vr8
3049    vmaddwod.h.bu.b  vr11,   vr1,   vr8
3050    vmaddwod.h.bu.b  vr12,   vr2,   vr8
3051    vmaddwod.h.bu.b  vr13,   vr3,   vr8
3052    vhaddw.w.h       vr0,    vr10,  vr10
3053    vhaddw.w.h       vr1,    vr11,  vr11
3054    vssrarni.h.w     vr1,    vr0,   2 //h0 h1 h2 h3
3055    vhaddw.w.h       vr2,    vr12,  vr12
3056    vhaddw.w.h       vr3,    vr13,  vr13
3057    vssrarni.h.w     vr3,    vr2,   2 //h4 h5 h6 ~
3058    vbsrl.v          vr2,    vr1,   4
3059    vextrins.w       vr2,    vr3,   0x30 //h1 h2 h3 h4
3060    vilvl.h          vr4,    vr2,   vr1 //h0 h1 h1 h2 --
3061    vilvh.h          vr5,    vr2,   vr1 //h2 h3 h3 h4 --
3062    vbsrl.v          vr6,    vr3,   4
3063    vilvl.h          vr6,    vr6,   vr3 //h4 h5 h5 h6 --
3064    vbsrl.v          vr3,    vr3,   8  //h6 ~
3065.l_\lable\()put_hv_2w_loop:
3066    vld              vr0,    a2,    0
3067    vldx             vr2,    a2,    a3
3068    add.d            a2,     a2,    t2
3069    vshuf.b          vr0,    vr2,   vr0,  vr7
3070    vdp2.h.bu.b      vr17,   vr0,   vr8
3071    vhaddw.w.h       vr17,   vr17,  vr17
3072    vssrarni.h.w     vr17,   vr17,  2 //h7 h8
3073    vextrins.w       vr3,    vr17,  0x10 //h6 h7
3074    vilvl.h          vr3,    vr17,  vr3  //h6 h7 h7 h8 --
3075
3076    vmulwev.w.h      vr18,   vr4,   vr9
3077    vmulwev.w.h      vr19,   vr5,   vr14
3078    vmulwev.w.h      vr20,   vr6,   vr15
3079    vmulwev.w.h      vr21,   vr3,   vr16
3080    vmaddwod.w.h     vr18,   vr4,   vr9
3081    vmaddwod.w.h     vr19,   vr5,   vr14
3082    vmaddwod.w.h     vr20,   vr6,   vr15
3083    vmaddwod.w.h     vr21,   vr3,   vr16
3084    vaddi.hu         vr4,    vr5,   0
3085    vaddi.hu         vr5,    vr6,   0
3086    vaddi.hu         vr6,    vr3,   0
3087    vbsrl.v          vr3,    vr17,  4 //h8 ~
3088    vadd.w           vr18,   vr18,  vr19
3089    vadd.w           vr18,   vr18,  vr20
3090    vadd.w           vr18,   vr18,  vr21
3091
3092    vssrarni.hu.w    vr0,    vr18,  10
3093    vssrani.bu.h     vr0,    vr0,   0
3094    vstelm.h         vr0,    a0,    0,   0
3095    add.d            a0,     a0,    a1
3096    vstelm.h         vr0,    a0,    0,   1
3097    add.d            a0,     a0,    a1
3098    addi.d           a5,     a5,    -2
3099    bnez             a5,     .l_\lable\()put_hv_2w_loop
3100    b                .l_\lable\()end_put_8tap
3101
3102.l_\lable\()put_hv_4w:
3103    addi.d           a2,     a2,    2 // net column offset -1: outer taps are zero (4-tap)
3104    vld              vr0,    a2,    0
3105    vldx             vr1,    a2,    a3
3106    vldx             vr2,    a2,    t2
3107    add.d            a2,     a2,    t3
3108    vld              vr3,    a2,    0
3109    vldx             vr4,    a2,    a3
3110    vldx             vr5,    a2,    t2
3111    vldx             vr6,    a2,    t3
3112    add.d            a2,     a2,    t4
3113
3114    la.local         t1,     subpel_h_shuf1
3115    vld              vr7,    t1,    0
3116    vbsrl.v          vr8,    vr8,   2
3117    vreplvei.w       vr8,    vr8,   0
3118
3119    //fv
3120    vreplvei.w       vr17,   vr9,   0
3121    vreplvei.w       vr18,   vr9,   1
3122    vreplvei.w       vr19,   vr9,   2
3123    vreplvei.w       vr20,   vr9,   3
3124
3125    //DAV1D_FILTER_8TAP_RND
3126    vshuf.b          vr0,    vr0,   vr0,  vr7
3127    vshuf.b          vr1,    vr1,   vr1,  vr7
3128    vshuf.b          vr2,    vr2,   vr2,  vr7
3129    vshuf.b          vr3,    vr3,   vr3,  vr7
3130    vshuf.b          vr4,    vr4,   vr4,  vr7
3131    vshuf.b          vr5,    vr5,   vr5,  vr7
3132    vshuf.b          vr6,    vr6,   vr6,  vr7
3133
3134    vmulwev.h.bu.b   vr10,   vr0,   vr8
3135    vmulwev.h.bu.b   vr11,   vr1,   vr8
3136    vmulwev.h.bu.b   vr12,   vr2,   vr8
3137    vmulwev.h.bu.b   vr13,   vr3,   vr8
3138    vmulwev.h.bu.b   vr14,   vr4,   vr8
3139    vmulwev.h.bu.b   vr15,   vr5,   vr8
3140    vmulwev.h.bu.b   vr16,   vr6,   vr8
3141    vmaddwod.h.bu.b  vr10,   vr0,   vr8
3142    vmaddwod.h.bu.b  vr11,   vr1,   vr8
3143    vmaddwod.h.bu.b  vr12,   vr2,   vr8
3144    vmaddwod.h.bu.b  vr13,   vr3,   vr8
3145    vmaddwod.h.bu.b  vr14,   vr4,   vr8
3146    vmaddwod.h.bu.b  vr15,   vr5,   vr8
3147    vmaddwod.h.bu.b  vr16,   vr6,   vr8
3148
3149    vhaddw.w.h       vr10,   vr10,  vr10
3150    vhaddw.w.h       vr11,   vr11,  vr11
3151    vhaddw.w.h       vr12,   vr12,  vr12
3152    vhaddw.w.h       vr13,   vr13,  vr13
3153    vhaddw.w.h       vr14,   vr14,  vr14
3154    vhaddw.w.h       vr15,   vr15,  vr15
3155    vhaddw.w.h       vr16,   vr16,  vr16
3156
3157    vssrarni.h.w     vr10,   vr10,  2 //h0
3158    vssrarni.h.w     vr11,   vr11,  2 //h1
3159    vssrarni.h.w     vr12,   vr12,  2 //h2
3160    vssrarni.h.w     vr13,   vr13,  2 //h3
3161    vssrarni.h.w     vr14,   vr14,  2 //h4
3162    vssrarni.h.w     vr15,   vr15,  2 //h5
3163    vssrarni.h.w     vr16,   vr16,  2 //h6
3164
3165    //h0
3166    vilvl.h          vr0,    vr11,  vr10 //01
3167    vilvl.h          vr1,    vr13,  vr12 //23
3168    vilvl.h          vr2,    vr15,  vr14 //45
3169    //h1
3170    vilvl.h          vr4,    vr12,  vr11 //12
3171    vilvl.h          vr5,    vr14,  vr13 //34
3172    vilvl.h          vr6,    vr16,  vr15 //56
3173
3174.l_\lable\()put_hv_4w_loop:
3175    vld              vr9,    a2,    0
3176    vldx             vr10,   a2,    a3
3177    add.d            a2,     a2,    t2
3178
3179    //DAV1D_FILTER_8TAP_CLIP
3180    vshuf.b          vr9,    vr9,   vr9,  vr7
3181    vshuf.b          vr10,   vr10,  vr10, vr7
3182    vmulwev.h.bu.b   vr11,   vr9,   vr8
3183    vmulwev.h.bu.b   vr12,   vr10,  vr8
3184    vmaddwod.h.bu.b  vr11,   vr9,   vr8
3185    vmaddwod.h.bu.b  vr12,   vr10,  vr8
3186    vhaddw.w.h       vr11,   vr11,  vr11
3187    vhaddw.w.h       vr12,   vr12,  vr12
3188    vssrarni.h.w     vr11,   vr11,  2 //h7
3189    vssrarni.h.w     vr12,   vr12,  2 //h8
3190    vilvl.h          vr3,    vr11,  vr16 //67
3191    vilvl.h          vr13,   vr12,  vr11 //78
3192
3193    vmulwev.w.h      vr9,    vr0,   vr17
3194    vmulwev.w.h      vr10,   vr1,   vr18
3195    vmulwev.w.h      vr14,   vr2,   vr19
3196    vmulwev.w.h      vr15,   vr3,   vr20
3197    vmaddwod.w.h     vr9,    vr0,   vr17
3198    vmaddwod.w.h     vr10,   vr1,   vr18
3199    vmaddwod.w.h     vr14,   vr2,   vr19
3200    vmaddwod.w.h     vr15,   vr3,   vr20
3201    vadd.w           vr16,   vr9,   vr10
3202    vadd.w           vr16,   vr16,  vr14
3203    vadd.w           vr16,   vr16,  vr15
3204
3205    vmulwev.w.h      vr9,    vr4,   vr17
3206    vmulwev.w.h      vr10,   vr5,   vr18
3207    vmulwev.w.h      vr14,   vr6,   vr19
3208    vmulwev.w.h      vr15,   vr13,  vr20
3209    vmaddwod.w.h     vr9,    vr4,   vr17
3210    vmaddwod.w.h     vr10,   vr5,   vr18
3211    vmaddwod.w.h     vr14,   vr6,   vr19
3212    vmaddwod.w.h     vr15,   vr13,  vr20
3213    vadd.w           vr21,   vr9,   vr10
3214    vadd.w           vr21,   vr21,  vr14
3215    vadd.w           vr21,   vr21,  vr15
3216
3217    vssrarni.hu.w    vr21,   vr16,  10
3218    vssrani.bu.h     vr21,   vr21,  0
3219    //cache
3220    vaddi.hu         vr0,    vr1,   0
3221    vaddi.hu         vr1,    vr2,   0
3222    vaddi.hu         vr2,    vr3,   0
3223    vaddi.hu         vr4,    vr5,   0
3224    vaddi.hu         vr5,    vr6,   0
3225    vaddi.hu         vr6,    vr13,  0
3226    vaddi.hu         vr16,   vr12,  0
3227
3228    vstelm.w         vr21,   a0,    0,    0
3229    add.d            a0,     a0,    a1
3230    vstelm.w         vr21,   a0,    0,    1
3231    add.d            a0,     a0,    a1
3232    addi.w           a5,     a5,    -2
3233    bnez             a5,     .l_\lable\()put_hv_4w_loop
3234    b                .l_\lable\()end_put_8tap
3235
3236.l_\lable\()put_hv_8w:
3237.l_\lable\()put_hv_16w:
3238.l_\lable\()put_hv_32w:
3239.l_\lable\()put_hv_64w:
3240.l_\lable\()put_hv_128w:
3241    addi.d          sp,      sp,    -8*8 // spill f24-f31: extra registers for the row-pair cache
3242    fst.d           f24,     sp,    0
3243    fst.d           f25,     sp,    8
3244    fst.d           f26,     sp,    16
3245    fst.d           f27,     sp,    24
3246    fst.d           f28,     sp,    32
3247    fst.d           f29,     sp,    40
3248    fst.d           f30,     sp,    48
3249    fst.d           f31,     sp,    56
3250    addi.d          t0,      a2,    0 //src
3251    addi.d          t5,      a5,    0 //h
3252    addi.d          t8,      a0,    0 //dst
3253    la.local        t1,      subpel_h_shuf1
3254    vld             vr7,     t1,    0
3255    vaddi.bu        vr11,    vr7,   4
3256    vaddi.bu        vr12,    vr7,   8
3257    vreplvei.w      vr10,    vr8,   1
3258    vreplvei.w      vr8,     vr8,   0
3259    vreplvei.w      vr20,    vr9,   1
3260    vreplvei.w      vr21,    vr9,   2
3261    vreplvei.w      vr22,    vr9,   3
3262    vreplvei.w      vr9,     vr9,   0
3263.l_\lable\()put_hv_8w_loop0:
3264    vld             vr0,     a2,    0
3265    vldx            vr1,     a2,    a3
3266    vldx            vr2,     a2,    t2
3267    add.d           a2,      a2,    t3
3268    vld             vr3,     a2,    0
3269    vldx            vr4,     a2,    a3
3270    vldx            vr5,     a2,    t2
3271    vldx            vr6,     a2,    t3
3272    add.d           a2,      a2,    t4
3273
3274    FILTER_8TAP_8W  vr0 //h0
3275    FILTER_8TAP_8W  vr1 //h1
3276    FILTER_8TAP_8W  vr2 //h2
3277    FILTER_8TAP_8W  vr3 //h3
3278    FILTER_8TAP_8W  vr4 //h4
3279    FILTER_8TAP_8W  vr5 //h5
3280    FILTER_8TAP_8W  vr6 //h6
3281
3282    //h0' low part
3283    vilvl.h         vr23,    vr1,   vr0 //01
3284    vilvl.h         vr24,    vr3,   vr2 //23
3285    vilvl.h         vr25,    vr5,   vr4 //45
3286    //h0' high part
3287    vilvh.h         vr26,    vr1,   vr0 //01
3288    vilvh.h         vr27,    vr3,   vr2 //23
3289    vilvh.h         vr28,    vr5,   vr4 //45
3290
3291    //h1' low part
3292    vilvl.h         vr29,    vr2,   vr1 //12
3293    vilvl.h         vr30,    vr4,   vr3 //34
3294    vilvl.h         vr31,    vr6,   vr5 //56
3295    //h1' high part
3296    vilvh.h         vr0,     vr2,   vr1 //12
3297    vilvh.h         vr1,     vr4,   vr3 //34
3298    vilvh.h         vr2,     vr6,   vr5 //56
3299
3300.l_\lable\()put_hv_8w_loop:
3301    vld             vr3,     a2,    0
3302    vldx            vr4,     a2,    a3
3303    add.d           a2,      a2,    t2
3304
3305    FILTER_8TAP_8W  vr3 //h7
3306    FILTER_8TAP_8W  vr4 //h8
3307
3308    //h0' low part
3309    vilvl.h         vr16,    vr3,   vr6 //67 ~low
3310    vmulwev.w.h     vr13,    vr23,  vr9
3311    vmulwev.w.h     vr14,    vr24,  vr20
3312    vmulwev.w.h     vr15,    vr25,  vr21
3313    vmulwev.w.h     vr17,    vr16,  vr22
3314    vmaddwod.w.h    vr13,    vr23,  vr9
3315    vmaddwod.w.h    vr14,    vr24,  vr20
3316    vmaddwod.w.h    vr15,    vr25,  vr21
3317    vmaddwod.w.h    vr17,    vr16,  vr22
3318    vadd.w          vr13,    vr13,  vr14
3319    vadd.w          vr13,    vr13,  vr15
3320    vadd.w          vr13,    vr13,  vr17
3321    //cache
3322    vaddi.hu        vr23,    vr24,  0
3323    vaddi.hu        vr24,    vr25,  0
3324    vaddi.hu        vr25,    vr16,  0
3325
3326    //h0' high part
3327    vilvh.h         vr17,    vr3,   vr6 //67 ~high
3328    vmulwev.w.h     vr14,    vr26,  vr9
3329    vmulwev.w.h     vr15,    vr27,  vr20
3330    vmulwev.w.h     vr16,    vr28,  vr21
3331    vmulwev.w.h     vr18,    vr17,  vr22
3332    vmaddwod.w.h    vr14,    vr26,  vr9
3333    vmaddwod.w.h    vr15,    vr27,  vr20
3334    vmaddwod.w.h    vr16,    vr28,  vr21
3335    vmaddwod.w.h    vr18,    vr17,  vr22
3336    vadd.w          vr14,    vr14,  vr15
3337    vadd.w          vr14,    vr14,  vr16
3338    vadd.w          vr14,    vr14,  vr18
3339    vssrarni.hu.w   vr14,    vr13,  10
3340    vssrarni.bu.h   vr5,     vr14,  0
3341    vstelm.d        vr5,     a0,    0,   0
3342    add.d           a0,      a0,    a1
3343    //cache
3344    vaddi.hu        vr26,    vr27,  0
3345    vaddi.hu        vr27,    vr28,  0
3346    vaddi.hu        vr28,    vr17,  0
3347    vaddi.hu        vr6,     vr4,   0
3348
3349    vilvl.h         vr5,     vr4,   vr3 //78 ~low
3350    vilvh.h         vr4,     vr4,   vr3 //78 ~high
3351
3352    //h1' low part
3353    vmulwev.w.h     vr13,    vr29,  vr9
3354    vmulwev.w.h     vr14,    vr30,  vr20
3355    vmulwev.w.h     vr15,    vr31,  vr21
3356    vmulwev.w.h     vr16,    vr5,   vr22
3357    vmaddwod.w.h    vr13,    vr29,  vr9
3358    vmaddwod.w.h    vr14,    vr30,  vr20
3359    vmaddwod.w.h    vr15,    vr31,  vr21
3360    vmaddwod.w.h    vr16,    vr5,   vr22
3361    vadd.w          vr13,    vr13,  vr14
3362    vadd.w          vr13,    vr13,  vr15
3363    vadd.w          vr13,    vr13,  vr16
3364    //cache
3365    vaddi.hu        vr29,    vr30,  0
3366    vaddi.hu        vr30,    vr31,  0
3367    vaddi.hu        vr31,    vr5,   0
3368
3369    //h1' high part
3370    vmulwev.w.h     vr14,    vr0,   vr9
3371    vmulwev.w.h     vr15,    vr1,   vr20
3372    vmulwev.w.h     vr16,    vr2,   vr21
3373    vmulwev.w.h     vr17,    vr4,   vr22
3374    vmaddwod.w.h    vr14,    vr0,   vr9
3375    vmaddwod.w.h    vr15,    vr1,   vr20
3376    vmaddwod.w.h    vr16,    vr2,   vr21
3377    vmaddwod.w.h    vr17,    vr4,   vr22
3378    vadd.w          vr14,    vr14,  vr15
3379    vadd.w          vr14,    vr14,  vr16
3380    vadd.w          vr14,    vr14,  vr17
3381    vssrarni.hu.w   vr14,    vr13,  10
3382    vssrarni.bu.h   vr5,     vr14,  0
3383    vstelm.d        vr5,     a0,    0,   0
3384    add.d           a0,      a0,    a1
3385    //cache
3386    vaddi.hu        vr0,     vr1,   0
3387    vaddi.hu        vr1,     vr2,   0
3388    vaddi.hu        vr2,     vr4,   0
3389
3390    addi.w          a5,      a5,    -2
3391    bnez            a5,      .l_\lable\()put_hv_8w_loop
3392    addi.d          a2,      t0,    8
3393    addi.d          t0,      t0,    8
3394    addi.d          a0,      t8,    8
3395    addi.d          t8,      t8,    8
3396    addi.d          a5,      t5,    0
3397    addi.w          a4,      a4,    -8
3398    bnez            a4,      .l_\lable\()put_hv_8w_loop0
3399    fld.d           f24,     sp,    0
3400    fld.d           f25,     sp,    8
3401    fld.d           f26,     sp,    16
3402    fld.d           f27,     sp,    24
3403    fld.d           f28,     sp,    32
3404    fld.d           f29,     sp,    40
3405    fld.d           f30,     sp,    48
3406    fld.d           f31,     sp,    56
3407    addi.d          sp,      sp,    8*8
3408.l_\lable\()end_put_8tap:
3409.endm
3410
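/*
put_8tap entry points. Each stores its filter_type on the stack, where
the macro body reads it back (ld.d ..., sp, 0): bits 0-1 select the
horizontal filter family and bits 2-3 the vertical one, so e.g.
put_8tap_smooth_sharp stores 9 = (2 << 2) | 1 (smooth fh, sharp fv).
*/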
3411function put_8tap_regular_8bpc_lsx
3412    addi.d   sp, sp,  -16
3413    st.d   zero, sp,  0
3414    PUT_8TAP_8BPC_LSX 0
3415    addi.d   sp, sp,  16
3416endfunc
3417
3418function put_8tap_smooth_regular_8bpc_lsx
3419    addi.d   sp, sp,  -16
3420    li.w     t0, 1
3421    st.d     t0, sp,  0
3422    PUT_8TAP_8BPC_LSX 1
3423    addi.d   sp, sp,  16
3424endfunc
3425
3426function put_8tap_sharp_regular_8bpc_lsx
3427    addi.d   sp, sp,  -16
3428    li.w     t0, 2
3429    st.d     t0, sp,  0
3430    PUT_8TAP_8BPC_LSX 2
3431    addi.d   sp, sp,  16
3432endfunc
3433
3434function put_8tap_regular_smooth_8bpc_lsx
3435    addi.d   sp, sp,  -16
3436    li.w     t0, 4
3437    st.d     t0, sp,  0
3438    PUT_8TAP_8BPC_LSX 4
3439    addi.d   sp, sp,  16
3440endfunc
3441
3442function put_8tap_smooth_8bpc_lsx
3443    addi.d   sp, sp,  -16
3444    li.w     t0, 5
3445    st.d     t0, sp,  0
3446    PUT_8TAP_8BPC_LSX 5
3447    addi.d   sp, sp,  16
3448endfunc
3449
3450function put_8tap_sharp_smooth_8bpc_lsx
3451    addi.d   sp, sp,  -16
3452    li.w     t0, 6
3453    st.d     t0, sp,  0
3454    PUT_8TAP_8BPC_LSX 6
3455    addi.d   sp, sp,  16
3456endfunc
3457
3458function put_8tap_regular_sharp_8bpc_lsx
3459    addi.d   sp, sp,  -16
3460    li.w     t0, 8
3461    st.d     t0, sp,  0
3462    PUT_8TAP_8BPC_LSX 8
3463    addi.d   sp, sp,  16
3464endfunc
3465
3466function put_8tap_smooth_sharp_8bpc_lsx
3467    addi.d   sp, sp,  -16
3468    li.w     t0, 9
3469    st.d     t0, sp,  0
3470    PUT_8TAP_8BPC_LSX 9
3471    addi.d   sp, sp,  16
3472endfunc
3473
3474function put_8tap_sharp_8bpc_lsx
3475    addi.d   sp, sp,  -16
3476    li.w     t0, 10
3477    st.d     t0, sp,  0
3478    PUT_8TAP_8BPC_LSX 10
3479    addi.d   sp, sp,  16
3480endfunc
3481
3482const shufb1
3483.byte 0,1,2,3,4,5,6,7,1,2,3,4,5,6,7,8
3484endconst
3485
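/*
Horizontal prep helpers. The three shuffle masks (subpel_h_shuf1-based,
at byte offsets 0/4/8 in the callers) and the two 4-tap filter halves
let the 8-tap dot product run as widening byte multiply-accumulates.
Per output pixel, as a sketch:

    tmp[x] = (sum(k) fh[k] * src[x - 3 + k] + 2) >> 2; // 16-bit, px << 4 scale
*/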
3486.macro PREP_H_8W in0
3487    vshuf.b          vr2,    \in0,  \in0,   vr6
3488    vshuf.b          vr3,    \in0,  \in0,   vr7
3489    vshuf.b          vr4,    \in0,  \in0,   vr8
3490    vmulwev.h.bu.b   vr12,   vr2,   vr22
3491    vmulwev.h.bu.b   vr13,   vr3,   vr23
3492    vmulwev.h.bu.b   vr14,   vr3,   vr22
3493    vmulwev.h.bu.b   vr15,   vr4,   vr23
3494    vmaddwod.h.bu.b  vr12,   vr2,   vr22
3495    vmaddwod.h.bu.b  vr13,   vr3,   vr23
3496    vmaddwod.h.bu.b  vr14,   vr3,   vr22
3497    vmaddwod.h.bu.b  vr15,   vr4,   vr23
3498    vadd.h           vr12,   vr12,  vr13
3499    vadd.h           vr14,   vr14,  vr15
3500    vhaddw.w.h       vr12,   vr12,  vr12
3501    vhaddw.w.h       \in0,   vr14,  vr14
3502    vssrarni.h.w     \in0,   vr12,  2
3503.endm
3504
3505.macro PREP_HV_8W_LASX in0
3506    xvshuf.b         xr4,   \in0,  \in0,   xr19
3507    xvshuf.b         xr5,   \in0,  \in0,   xr20
3508    xvshuf.b         xr6,   \in0,  \in0,   xr21
3509    xvmulwev.h.bu.b  xr7,   xr4,   xr22
3510    xvmulwev.h.bu.b  xr9,   xr5,   xr23
3511    xvmulwev.h.bu.b  xr10,  xr5,   xr22
3512    xvmulwev.h.bu.b  xr11,  xr6,   xr23
3513    xvmaddwod.h.bu.b xr7,   xr4,   xr22
3514    xvmaddwod.h.bu.b xr9,   xr5,   xr23
3515    xvmaddwod.h.bu.b xr10,  xr5,   xr22
3516    xvmaddwod.h.bu.b xr11,  xr6,   xr23
3517    xvadd.h          xr7,   xr7,   xr9
3518    xvadd.h          xr9,   xr10,  xr11
3519    xvhaddw.w.h      xr7,   xr7,   xr7
3520    xvhaddw.w.h      \in0,  xr9,   xr9
3521    xvssrarni.h.w    \in0,  xr7,   2
3522.endm
3523
3524.macro PREP_8TAP_8BPC_LASX lable
3525    li.w             t0,     4
3526    la.local         t6,     dav1d_mc_subpel_filters
3527    slli.d           t2,     a2,    1  //src_stride*2
3528    add.d            t3,     t2,    a2 //src_stride*3
3529    slli.d           t4,     t2,    1
3530
3531    bnez             a5,     .l_\lable\()h_lasx //mx
3532    bnez             a6,     .l_\lable\()v_lasx
3533
3534    clz.w            t1,     a3
3535    li.w             t5,     24
3536    sub.w            t1,     t1,    t5
3537    la.local         t5,     .l_\lable\()prep_hv0_jtable_lasx
3538    alsl.d           t1,     t1,    t5,   1
3539    ld.h             t8,     t1,    0
3540    add.d            t5,     t5,    t8
3541    jirl             $r0,    t5,    0
3542
3543    .align   3
3544.l_\lable\()prep_hv0_jtable_lasx:
3545    .hword .l_\lable\()hv0_128w_lasx - .l_\lable\()prep_hv0_jtable_lasx
3546    .hword .l_\lable\()hv0_64w_lasx  - .l_\lable\()prep_hv0_jtable_lasx
3547    .hword .l_\lable\()hv0_32w_lasx  - .l_\lable\()prep_hv0_jtable_lasx
3548    .hword .l_\lable\()hv0_16w_lasx  - .l_\lable\()prep_hv0_jtable_lasx
3549    .hword .l_\lable\()hv0_8w_lasx   - .l_\lable\()prep_hv0_jtable_lasx
3550    .hword .l_\lable\()hv0_4w_lasx   - .l_\lable\()prep_hv0_jtable_lasx
3551
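    // No-subpel copy path: pixels are only widened and scaled by << 4,
    // the 8bpc prep intermediate precision.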
3552.l_\lable\()hv0_4w_lasx:
3553    fld.s            f0,     a1,    0
3554    fldx.s           f1,     a1,    a2
3555    fldx.s           f2,     a1,    t2
3556    fldx.s           f3,     a1,    t3
3557    add.d            a1,     a1,    t4
3558    xvpackev.w       xr0,    xr1,   xr0
3559    xvpackev.w       xr1,    xr3,   xr2
3560    xvpermi.q        xr0,    xr1,   0x02
3561    xvsllwil.hu.bu   xr0,    xr0,   4
3562    xvst             xr0,    a0,    0
3563    addi.d           a0,     a0,    32
3564    addi.d           a4,     a4,    -4
3565    bnez             a4,     .l_\lable\()hv0_4w_lasx
3566    b                .l_\lable\()end_pre_8tap_lasx
3567.l_\lable\()hv0_8w_lasx:
3568    fld.d            f0,     a1,    0
3569    fldx.d           f1,     a1,    a2
3570    fldx.d           f2,     a1,    t2
3571    fldx.d           f3,     a1,    t3
3572    add.d            a1,     a1,    t4
3573    xvpermi.q        xr0,    xr1,   0x02
3574    xvpermi.q        xr2,    xr3,   0x02
3575    xvsllwil.hu.bu   xr0,    xr0,   4
3576    xvsllwil.hu.bu   xr2,    xr2,   4
3577    xvst             xr0,    a0,    0
3578    xvst             xr2,    a0,    32
3579    addi.d           a0,     a0,    64
3580    addi.d           a4,     a4,    -4
3581    bnez             a4,     .l_\lable\()hv0_8w_lasx
3582    b                .l_\lable\()end_pre_8tap_lasx
3583.l_\lable\()hv0_16w_lasx:
3584    vld              vr0,    a1,    0
3585    vldx             vr1,    a1,    a2
3586    vldx             vr2,    a1,    t2
3587    vldx             vr3,    a1,    t3
3588    add.d            a1,     a1,    t4
3589    vext2xv.hu.bu    xr0,    xr0
3590    vext2xv.hu.bu    xr1,    xr1
3591    vext2xv.hu.bu    xr2,    xr2
3592    vext2xv.hu.bu    xr3,    xr3
3593    xvslli.h         xr0,    xr0,   4
3594    xvslli.h         xr1,    xr1,   4
3595    xvslli.h         xr2,    xr2,   4
3596    xvslli.h         xr3,    xr3,   4
3597    xvst             xr0,    a0,    0
3598    xvst             xr1,    a0,    32
3599    xvst             xr2,    a0,    64
3600    xvst             xr3,    a0,    96
3601    addi.d           a0,     a0,    128
3602    addi.d           a4,     a4,    -4
3603    bnez             a4,     .l_\lable\()hv0_16w_lasx
3604    b                .l_\lable\()end_pre_8tap_lasx
3605.l_\lable\()hv0_32w_lasx:
3606    xvld             xr0,    a1,    0
3607    xvldx            xr1,    a1,    a2
3608    xvldx            xr2,    a1,    t2
3609    xvldx            xr3,    a1,    t3
3610    add.d            a1,     a1,    t4
3611    xvpermi.d        xr4,    xr0,   0xD8
3612    xvpermi.d        xr5,    xr1,   0xD8
3613    xvpermi.d        xr6,    xr2,   0xD8
3614    xvpermi.d        xr7,    xr3,   0xD8
3615    xvpermi.d        xr10,   xr0,   0x32
3616    xvpermi.d        xr11,   xr1,   0x32
3617    xvpermi.d        xr12,   xr2,   0x32
3618    xvpermi.d        xr13,   xr3,   0x32
3619    xvsllwil.hu.bu   xr0,    xr4,   4
3620    xvsllwil.hu.bu   xr1,    xr5,   4
3621    xvsllwil.hu.bu   xr2,    xr6,   4
3622    xvsllwil.hu.bu   xr3,    xr7,   4
3623    xvsllwil.hu.bu   xr4,    xr10,  4
3624    xvsllwil.hu.bu   xr5,    xr11,  4
3625    xvsllwil.hu.bu   xr6,    xr12,  4
3626    xvsllwil.hu.bu   xr7,    xr13,  4
3627    xvst             xr0,    a0,    0
3628    xvst             xr4,    a0,    32
3629    xvst             xr1,    a0,    64
3630    xvst             xr5,    a0,    96
3631    xvst             xr2,    a0,    128
3632    xvst             xr6,    a0,    160
3633    xvst             xr3,    a0,    192
3634    xvst             xr7,    a0,    224
3635    addi.d           a0,     a0,    256
3636    addi.d           a4,     a4,    -4
3637    bnez             a4,     .l_\lable\()hv0_32w_lasx
3638    b                .l_\lable\()end_pre_8tap_lasx
3639.l_\lable\()hv0_64w_lasx:
3640.l_\lable\()hv0_128w_lasx:
3641    addi.d           t0,     a1,    0
3642    addi.d           t5,     a4,    0
3643    srli.w           t7,     a3,    5
3644    slli.w           t7,     t7,    6 // t7 = w * 2: tmp row stride in bytes
3645    addi.d           t8,     a0,    0
3646.l_\lable\()hv0_32_loop_lasx:
3647    xvld             xr0,    a1,    0
3648    xvldx            xr1,    a1,    a2
3649    xvldx            xr2,    a1,    t2
3650    xvldx            xr3,    a1,    t3
3651    add.d            a1,     a1,    t4
3652    xvpermi.d        xr4,    xr0,   0xD8
3653    xvpermi.d        xr5,    xr1,   0xD8
3654    xvpermi.d        xr6,    xr2,   0xD8
3655    xvpermi.d        xr7,    xr3,   0xD8
3656    xvpermi.d        xr10,   xr0,   0x32
3657    xvpermi.d        xr11,   xr1,   0x32
3658    xvpermi.d        xr12,   xr2,   0x32
3659    xvpermi.d        xr13,   xr3,   0x32
3660    xvsllwil.hu.bu   xr0,    xr4,   4
3661    xvsllwil.hu.bu   xr1,    xr5,   4
3662    xvsllwil.hu.bu   xr2,    xr6,   4
3663    xvsllwil.hu.bu   xr3,    xr7,   4
3664    xvsllwil.hu.bu   xr4,    xr10,  4
3665    xvsllwil.hu.bu   xr5,    xr11,  4
3666    xvsllwil.hu.bu   xr6,    xr12,  4
3667    xvsllwil.hu.bu   xr7,    xr13,  4
3668    xvst             xr0,    a0,    0
3669    xvst             xr4,    a0,    32
3670    add.d            t1,     a0,    t7
3671    xvst             xr1,    t1,    0
3672    xvst             xr5,    t1,    32
3673    add.d            t1,     t1,    t7
3674    xvst             xr2,    t1,    0
3675    xvst             xr6,    t1,    32
3676    add.d            t1,     t1,    t7
3677    xvst             xr3,    t1,    0
3678    xvst             xr7,    t1,    32
3679    add.d            a0,     t1,    t7
3680    addi.d           a4,     a4,   -4
3681    bnez             a4,     .l_\lable\()hv0_32_loop_lasx
3682    addi.d           a1,     t0,    32
3683    addi.d           t0,     t0,    32
3684    addi.d           a0,     t8,    64
3685    addi.d           t8,     t8,    64
3686    addi.d           a4,     t5,    0
3687    addi.d           a3,     a3,   -32
3688    bnez             a3,     .l_\lable\()hv0_32_loop_lasx
3689    b                .l_\lable\()end_pre_8tap_lasx
3690
3691.l_\lable\()h_lasx:
3692    bnez             a6,     .l_\lable\()hv_lasx //both fh and fv
3693
3694    andi             t1,    a7,    3
3695    blt              t0,    a3,    .l_\lable\()h_idx_fh_lasx
3696    andi             t1,    a7,    1
3697    addi.w           t1,    t1,    3
3698.l_\lable\()h_idx_fh_lasx:
3699    addi.w           t5,    zero,  120
3700    mul.w            t1,    t1,    t5
3701    addi.w           t5,    a5,    -1
3702    slli.w           t5,    t5,    3
3703    add.w            t1,    t1,    t5
3704    add.d            t1,    t6,    t1 //fh's offset
3705    xvldrepl.d       xr22,  t1,    0
3706
3707    addi.d           a1,    a1,    -3
3708    clz.w            t1,    a3
3709    li.w             t5,    24
3710    sub.w            t1,    t1,    t5
3711    la.local         t5,    .l_\lable\()prep_h_jtable_lasx
3712    alsl.d           t1,    t1,    t5,   1
3713    ld.h             t8,    t1,    0
3714    add.d            t5,    t5,    t8
3715    jirl             $r0,   t5,    0
3716
3717    .align   3
3718.l_\lable\()prep_h_jtable_lasx:
3719    .hword .l_\lable\()h_128w_lasx - .l_\lable\()prep_h_jtable_lasx
3720    .hword .l_\lable\()h_64w_lasx  - .l_\lable\()prep_h_jtable_lasx
3721    .hword .l_\lable\()h_32w_lasx  - .l_\lable\()prep_h_jtable_lasx
3722    .hword .l_\lable\()h_16w_lasx  - .l_\lable\()prep_h_jtable_lasx
3723    .hword .l_\lable\()h_8w_lasx   - .l_\lable\()prep_h_jtable_lasx
3724    .hword .l_\lable\()h_4w_lasx   - .l_\lable\()prep_h_jtable_lasx
3725
3726.l_\lable\()h_4w_lasx:
3727    addi.d           a1,    a1,    2
3728    la.local         t7,    subpel_h_shuf1
3729    vld              vr7,   t7,    0
3730    xvreplve0.q      xr7,   xr7
3731    xvbsrl.v         xr22,  xr22,  2
3732    xvreplve0.w      xr22,  xr22
3733.l_\lable\()h_4w_loop_lasx:
3734    vld              vr0,   a1,    0
3735    vldx             vr1,   a1,    a2
3736    vldx             vr2,   a1,    t2
3737    vldx             vr3,   a1,    t3
3738    add.d            a1,    a1,    t4
3739    xvpermi.q        xr1,   xr0,   0x20
3740    xvpermi.q        xr3,   xr2,   0x20
3741    xvshuf.b         xr1,   xr1,   xr1,   xr7
3742    xvshuf.b         xr3,   xr3,   xr3,   xr7
3743    xvmulwev.h.bu.b  xr0,   xr1,   xr22
3744    xvmulwev.h.bu.b  xr2,   xr3,   xr22
3745    xvmaddwod.h.bu.b xr0,   xr1,   xr22
3746    xvmaddwod.h.bu.b xr2,   xr3,   xr22
3747    xvhaddw.w.h      xr0,   xr0,   xr0
3748    xvhaddw.w.h      xr2,   xr2,   xr2
3749    xvssrarni.h.w    xr2,   xr0,   2
3750    xvpermi.d        xr2,   xr2,   0xd8
3751    xvst             xr2,   a0,    0
3752    addi.d           a0,    a0,    32
3753    addi.w           a4,    a4,    -4
3754    bnez             a4,    .l_\lable\()h_4w_loop_lasx
3755    b                .l_\lable\()end_pre_8tap_lasx
3756
3757.l_\lable\()h_8w_lasx:
3758    la.local         t7,    subpel_h_shuf1
3759    vld              vr6,   t7,    0
3760    vbsrl.v          vr23,  vr22,  4 //fh
3761    xvreplve0.w      xr23,  xr23
3762    xvreplve0.w      xr22,  xr22
3763    xvreplve0.q      xr19,  xr6
3764    xvaddi.bu        xr20,  xr19,  4
3765    xvaddi.bu        xr21,  xr19,  8
3766.l_\lable\()h_8w_loop_lasx:
3767    xvld             xr0,   a1,    0
3768    xvldx            xr1,   a1,    a2
3769    add.d            a1,    a1,    t2
3770    xvpermi.q        xr0,   xr1,   0x02
3771    PREP_HV_8W_LASX  xr0
3772    xvst             xr0,   a0,    0
3773    addi.d           a0,    a0,    32
3774    addi.d           a4,    a4,   -2
3775    bnez             a4,    .l_\lable\()h_8w_loop_lasx
3776    b                .l_\lable\()end_pre_8tap_lasx
3777
3778.l_\lable\()h_16w_lasx:
3779    la.local         t7,    subpel_h_shuf1
3780    vld              vr6,   t7,    0
3781    vbsrl.v          vr23,  vr22,  4 //fh
3782    xvreplve0.w      xr23,  xr23
3783    xvreplve0.w      xr22,  xr22
3784    xvreplve0.q      xr19,  xr6
3785    xvaddi.bu        xr20,  xr19,  4
3786    xvaddi.bu        xr21,  xr19,  8
3787.l_\lable\()h_16w_loop_lasx:
3788    xvld             xr0,   a1,    0
3789    xvld             xr1,   a1,    8
3790    add.d            a1,    a1,    a2
3791    xvpermi.q        xr0,   xr1,   0x02
3792    PREP_HV_8W_LASX  xr0
3793    xvst             xr0,   a0,    0
3794    xvld             xr0,   a1,    0
3795    xvld             xr1,   a1,    8
3796    add.d            a1,    a1,    a2
3797    xvpermi.q        xr0,   xr1,   0x02
3798    PREP_HV_8W_LASX  xr0
3799    xvst             xr0,   a0,    32
3800    addi.d           a0,    a0,    64
3801    addi.w           a4,    a4,    -2
3802    bnez             a4,     .l_\lable\()h_16w_loop_lasx
3803    b                .l_\lable\()end_pre_8tap_lasx
3804
3805.l_\lable\()h_32w_lasx:
3806.l_\lable\()h_64w_lasx:
3807.l_\lable\()h_128w_lasx:
3808    la.local         t7,    subpel_h_shuf1
3809    vld              vr6,   t7,    0
3810    vbsrl.v          vr23,  vr22,  4 //fh
3811    xvreplve0.w      xr23,  xr23
3812    xvreplve0.w      xr22,  xr22
3813    xvreplve0.q      xr19,  xr6
3814    xvaddi.bu        xr20,  xr19,  4
3815    xvaddi.bu        xr21,  xr19,  8
3816    addi.d           t5,    a1,    0 //src
3817    addi.d           t6,    a3,    0 //w
3818    slli.w           t7,    a3,    1 //store offset
3819    addi.d           t8,    a0,    0 //dst
3820.l_\lable\()h_16_loop_lasx:
3821    xvld             xr0,   a1,    0
3822    xvld             xr1,   a1,    8
3823    xvpermi.q        xr0,   xr1,   0x02
3824    PREP_HV_8W_LASX  xr0
3825    xvst             xr0,   a0,    0
3826    xvld             xr0,   a1,    16
3827    xvld             xr1,   a1,    24
3828    xvpermi.q        xr0,   xr1,   0x02
3829    PREP_HV_8W_LASX  xr0
3830    xvst             xr0,   a0,    32
3831    addi.d           a0,    a0,    64
3832    addi.d           a1,    a1,    32
3833    addi.d           a3,    a3,   -32
3834    bnez             a3,    .l_\lable\()h_16_loop_lasx
3835    add.d            a1,    t5,    a2
3836    add.d            t5,    t5,    a2
3837    add.d            a0,    t8,    t7
3838    add.d            t8,    t8,    t7
3839    addi.d           a3,    t6,    0
3840    addi.d           a4,    a4,    -1
3841    bnez             a4,    .l_\lable\()h_16_loop_lasx
3842    b                .l_\lable\()end_pre_8tap_lasx
3843
3844.l_\lable\()hv_lasx:
3845    andi             t1,    a7,    3
3846    blt              t0,    a3,    .l_\lable\()hv_idx_fh_lasx
3847    andi             t1,    a7,    1
3848    addi.w           t1,    t1,    3
3849.l_\lable\()hv_idx_fh_lasx:
3850    addi.w           t5,    zero,  120
3851    mul.w            t1,    t1,    t5
3852    addi.w           t5,    a5,    -1
3853    slli.w           t5,    t5,    3
3854    add.w            t1,    t1,    t5
3855    add.d            t1,    t6,    t1 //fh's offset
3856    xvldrepl.d       xr22,  t1,    0
3857    srli.w           a7,    a7,    2
3858    blt              t0,    a4,    .l_\lable\()hv_idx_fv_lasx
3859    andi             a7,    a7,    1
3860    addi.w           a7,    a7,    3
3861.l_\lable\()hv_idx_fv_lasx:
3862    addi.w           t5,    zero,  120
3863    mul.w            a7,    a7,    t5
3864    addi.w           t5,    a6,    -1
3865    slli.w           t5,    t5,    3
3866    add.w            a7,    a7,    t5
3867    add.d            a7,    t6,    a7 //fv's offset
3868    xvldrepl.d       xr8,   a7,    0
3869    xvsllwil.h.b     xr8,   xr8,   0
3870    sub.d            a1,    a1,    t3 // rewind src 3 rows
3871    addi.d           a1,    a1,    -1 // 4-tap path: outer taps are zero
3872    beq              a3,    t0,    .l_\lable\()hv_4w_lasx // w == 4
3873    addi.d           a1,    a1,    -2 // full 8-tap: total column offset -3
3874    b                .l_\lable\()hv_8w_lasx
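// hv, w == 4: the rows feeding four output pixels (lanes a..d in the
// comments below) are packed across both 128-bit halves, so each LASX
// op filters two row pairs at once.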
3875.l_\lable\()hv_4w_lasx:
3876    xvld             xr0,   a1,    0
3877    xvldx            xr1,   a1,    a2
3878    xvldx            xr2,   a1,    t2
3879    xvldx            xr3,   a1,    t3
3880    add.d            a1,    a1,    t4
3881    xvld             xr4,   a1,    0
3882    xvldx            xr5,   a1,    a2
3883    xvldx            xr6,   a1,    t2
3884    la.local         t1,    subpel_h_shuf2
3885    xvld             xr7,   t1,    0
3886    vbsrl.v          vr22,  vr22,  2
3887    xvreplve0.w      xr22,  xr22
3888    xvreplve0.q      xr8,   xr8
3889    xvrepl128vei.w   xr12,  xr8,   0
3890    xvrepl128vei.w   xr13,  xr8,   1
3891    xvrepl128vei.w   xr14,  xr8,   2
3892    xvrepl128vei.w   xr15,  xr8,   3
3893    xvilvl.d         xr0,   xr1,   xr0
3894    xvilvl.d         xr2,   xr3,   xr2
3895    xvilvl.d         xr4,   xr5,   xr4
3896    xvreplve0.q      xr0,   xr0
3897    xvreplve0.q      xr2,   xr2
3898    xvreplve0.q      xr4,   xr4
3899    xvreplve0.q      xr6,   xr6
3900    xvshuf.b         xr0,   xr0,   xr0,   xr7
3901    xvshuf.b         xr2,   xr2,   xr2,   xr7
3902    xvshuf.b         xr4,   xr4,   xr4,   xr7
3903    xvshuf.b         xr6,   xr6,   xr6,   xr7
3904    xvmulwev.h.bu.b  xr1,   xr0,   xr22
3905    xvmulwev.h.bu.b  xr3,   xr2,   xr22
3906    xvmulwev.h.bu.b  xr5,   xr4,   xr22
3907    xvmulwev.h.bu.b  xr9,   xr6,   xr22
3908    xvmaddwod.h.bu.b xr1,   xr0,   xr22
3909    xvmaddwod.h.bu.b xr3,   xr2,   xr22
3910    xvmaddwod.h.bu.b xr5,   xr4,   xr22
3911    xvmaddwod.h.bu.b xr9,   xr6,   xr22
3912    xvhaddw.w.h      xr1,   xr1,   xr1  // a0 b0 a1 b1  c0 d0 c1 d1
3913    xvhaddw.w.h      xr3,   xr3,   xr3  // a2 b2 a3 b3  c2 d2 c3 d3
3914    xvhaddw.w.h      xr5,   xr5,   xr5  // a4 b4 a5 b5  c4 d4 c5 d5
3915    xvhaddw.w.h      xr9,   xr9,   xr9  // a6 b6 -  -   c6 d6 -  -
3916    xvssrarni.h.w    xr3,   xr1,   2    // a0 b0 a1 b1  a2 b2 a3 b3  c0 d0 c1 d1  c2 d2 c3 d3
3917    xvssrarni.h.w    xr9,   xr5,   2    // a4 b4 a5 b5  a6 b6 -  -   c4 d4 c5 d5  c6 d6 -  -
3918    xvbsrl.v         xr4,   xr3,   4
3919    xvextrins.w      xr4,   xr9,   0x30 // a1 b1 a2 b2  a3 b3 a4 b4  c1 d1 c2 d2  c3 d3 c4 d4
3920    xvilvl.h         xr5,   xr4,   xr3  // a0 a1 b0 b1  a1 a2 b1 b2  c0 c1 d0 d1  c1 c2 d1 d2
3921    xvilvh.h         xr6,   xr4,   xr3  // a2 a3 b2 b3  a3 a4 b3 b4  c2 c3 d2 d3  c3 c4 d3 d4
3922    xvbsrl.v         xr10,  xr9,   4    // a5 b5 a6 b6  -  -  -  -   c5 d5 c6 d6  -  -  -  -
3923    xvilvl.h         xr11,  xr10,  xr9  // a4 a5 b4 b5  a5 a6 b5 b6  c4 c5 d4 d5  c5 c6 d5 d6
3924.l_\lable\()hv_w4_loop_lasx:
3925    xvmulwev.w.h     xr16,  xr5,   xr12 //a0 a1 (h0)
3926    xvmulwev.w.h     xr17,  xr6,   xr12 //a2 a3 (h1)
3927    xvmulwev.w.h     xr18,  xr6,   xr13 //a2 a3 (h0)
3928    xvmulwev.w.h     xr19,  xr11,  xr13 //a4 a5 (h1)
3929    xvmulwev.w.h     xr20,  xr11,  xr14 //a4 a5 (h0)
3930    xvmaddwod.w.h    xr16,  xr5,   xr12 //
3931    xvmaddwod.w.h    xr17,  xr6,   xr12 //
3932    xvmaddwod.w.h    xr18,  xr6,   xr13 //
3933    xvmaddwod.w.h    xr19,  xr11,  xr13 //
3934    xvmaddwod.w.h    xr20,  xr11,  xr14 //
3935    xvaddi.wu        xr5,   xr11,   0
3936    xvadd.w          xr16,  xr16,  xr18 //a0 a1 + a2 a3
3937    xvldx            xr18,  a1,    t3   //a7 b7 c7 d7
3938    add.d            a1,    a1,    t4
3939    xvadd.w          xr17,  xr17,  xr19 //a2 a3 + a4 a5
3940    xvld             xr19,  a1,    0    //a8 b8 c8 d8
3941    xvadd.w          xr16,  xr16,  xr20 //a0 a1 + a2 a3 + a4 a5
3942    xvldx            xr20,  a1,    a2   //a9 b9 c9 d9
3943    xvilvl.d         xr18,  xr19,  xr18
3944    xvreplve0.q      xr18,  xr18
3945    xvldx            xr19,  a1,    t2   //aa ba ca da
3946    xvilvl.d         xr20,  xr19,  xr20
3947    xvreplve0.q      xr20,  xr20
3948    xvshuf.b         xr18,  xr18,  xr18,  xr7
3949    xvshuf.b         xr20,  xr20,  xr20,  xr7
3950    xvmulwev.h.bu.b  xr21,  xr18,  xr22
3951    xvmulwev.h.bu.b  xr23,  xr20,  xr22
3952    xvmaddwod.h.bu.b xr21,  xr18,  xr22
3953    xvmaddwod.h.bu.b xr23,  xr20,  xr22
3954    xvhaddw.w.h      xr21,  xr21,  xr21 //a7 b7 a8 b8 c7 d7 c8 d8
3955    xvhaddw.w.h      xr23,  xr23,  xr23 //a9 b9 aa ba c9 d9 ca da
3956    xvssrarni.h.w    xr23,  xr21,  2    //a7 b7 a8 b8  a9 b9 aa ba  c7 d7 c8 d8  c9 d9 ca da
3957    xvbsll.v         xr0,   xr23,  4
3958    xvextrins.w      xr0,   xr9,   0x02 //a6 b6 a7 b7  a8 b8 a9 b9  c6 d6 c7 d7  c8 d8 c9 d9
3959    xvilvl.h         xr6,   xr23,  xr0  //a6 a7 b6 b7  a7 a8 b7 b8  c6 c7 d6 d7  c7 c8 d7 d8
3960    xvilvh.h         xr11,  xr23,  xr0  //a8 a9 b8 b9  a9 aa b9 ba  c8 c9 d8 d9  c9 ca d9 da
3961    xvbsrl.v         xr9,   xr23,  4
3962    xvmulwev.w.h     xr1 ,  xr6,   xr14 //a6 a7 (h0)
3963    xvmulwev.w.h     xr2 ,  xr6,   xr15 //a6 a7 (h1)
3964    xvmulwev.w.h     xr3 ,  xr11,  xr15 //a8 a9 (h1)
3965    xvmaddwod.w.h    xr1 ,  xr6,   xr14
3966    xvmaddwod.w.h    xr2 ,  xr6,   xr15
3967    xvmaddwod.w.h    xr3 ,  xr11,  xr15
3968    xvadd.w          xr17,  xr17,  xr1  //a2 a3 + a4 a5 + a6 a7
3969    xvadd.w          xr16,  xr16,  xr2  //a0 a1 + a2 a3 + a4 a5 + a6 a7
3970    xvadd.w          xr17,  xr17,  xr3  //a2 a3 + a4 a5 + a6 a7 + a8 a9
3971    xvssrarni.h.w    xr17,  xr16,  6    //a01 b01 a12 b12  a23 b23 a34 b34  c01 d01 c12 d12  c23 d23 c34 d34
3972    xvpermi.d        xr17,  xr17,  0xd8 //a01 b01 a12 b12  c01 d01 c12 d12  a23 b23 a34 b34  c23 d23 c34 d34
3973    xvshuf4i.w       xr17,  xr17,  0xd8
3974    xvst             xr17,  a0,    0
3975    addi.d           a0,    a0,    32
3976    addi.d           a4,    a4,    -4
3977    bnez             a4,    .l_\lable\()hv_w4_loop_lasx
3978    b                .l_\lable\()end_pre_8tap_lasx
3979
3980.l_\lable\()hv_8w_lasx:
3981    addi.d           sp,    sp,   -4*8 // spill f24-f27: fv tap broadcasts live in xr24-xr27
3982    fst.d            f24,   sp,    0
3983    fst.d            f25,   sp,    8
3984    fst.d            f26,   sp,    16
3985    fst.d            f27,   sp,    24
3986    la.local         t1,    subpel_h_shuf1
3987    vld              vr19,  t1,    0
3988    addi.d           t0,    a1,    0
3989    addi.d           t5,    a4,    0
3990    slli.w           t7,    a3,    1 // store offset
3991    addi.d           t8,    a0,    0
3992    xvreplve0.q      xr19,  xr19
3993    xvaddi.bu        xr20,  xr19,  4
3994    xvaddi.bu        xr21,  xr19,  8
3995    vbsrl.v          vr23,  vr22,  4
3996    xvreplve0.w      xr22,  xr22 //f0f1f2f3
3997    xvreplve0.w      xr23,  xr23 //f4f5f6f7
3998    xvreplve0.q      xr8,   xr8
3999    xvrepl128vei.w   xr24,  xr8,   0
4000    xvrepl128vei.w   xr25,  xr8,   1
4001    xvrepl128vei.w   xr26,  xr8,   2
4002    xvrepl128vei.w   xr27,  xr8,   3
4003.l_\lable\()hv_8w_loop0_lasx:
4004    xvld             xr0,   a1,    0
4005    xvldx            xr1,   a1,    a2
4006    xvldx            xr2,   a1,    t2
4007    add.d            a1,    a1,    t3
4008    xvld             xr3,   a1,    0
4009    xvldx            xr4,   a1,    a2
4010    xvldx            xr5,   a1,    t2
4011    xvldx            xr6,   a1,    t3
4012    add.d            a1,    a1,    t4
4013    xvpermi.q        xr0,   xr3,   0x02 //0 3
4014    xvpermi.q        xr1,   xr4,   0x02 //1 4
4015    xvpermi.q        xr2,   xr5,   0x02 //2 5
4016    xvpermi.q        xr3,   xr6,   0x02 //3 6
4017    PREP_HV_8W_LASX  xr0 //a0b0c0d0 e0f0g0h0 a3b3c3d3 e3f3g3h3
4018    PREP_HV_8W_LASX  xr1 //a1b1c1d1 e1f1g1h1 a4b4c4d4 e4f4g4h4
4019    PREP_HV_8W_LASX  xr2 //a2b2c2d2 e2f2g2h2 a5b5c5d5 e5f5g5h5
4020    PREP_HV_8W_LASX  xr3 //a3b3c3d3 e3f3g3h3 a6b6c6d6 e6f6g6h6
4021    xvpermi.d        xr0,   xr0,   0xd8
4022    xvpermi.d        xr1,   xr1,   0xd8
4023    xvpermi.d        xr2,   xr2,   0xd8
4024    xvpermi.d        xr18,  xr3,   0xd8
4025    xvilvl.h         xr12,  xr1,   xr0 //a0a1b0b1c0c1d0d1 e0e1f0f1g0g1h0h1
4026    xvilvh.h         xr13,  xr1,   xr0 //a3a4b3b4c3c4d3d4 e3e4f3f4g3g4h3h4
4027    xvilvl.h         xr14,  xr2,   xr1 //a1a2b1b2c1c2d1d2 e1e2f1f2g1g2h1h2
4028    xvilvh.h         xr15,  xr2,   xr1 //a4a5b4b5c4c5d4d5 e4e5f4f5g4g5h4h5
4029    xvilvl.h         xr16,  xr18,  xr2 //a2a3b2b3c2c3d2d3 e2e3f2f3g2g3h2h3
4030    xvilvh.h         xr17,  xr18,  xr2 //a5a6b5b6c5c6d5d6 e5e6f5f6g5g6h5h6
4031.l_\lable\()hv_8w_loop_lasx:
4032    xvld             xr0,   a1,    0
4033    xvldx            xr1,   a1,    a2
4034    add.d            a1,    a1,    t2
4035    xvpermi.q        xr0,   xr1,   0x02 //7 8
4036    PREP_HV_8W_LASX  xr0                //a7b7c7d7e7f7g7h7 a8b8c8d8e8f8g8h8
4037    xvpermi.q        xr3,   xr0,   0x03 //a6b6c6d6e6f6g6h6 a7b7c7d7e7f7g7h7
4038    xvpermi.d        xr3,   xr3,   0xd8 //a6b6c6d6a7b7c7d7 e6f6g6h6e7f7g7h7
4039    xvpermi.d        xr1,   xr0,   0xd8 //a7b7c7d7a8b8c8d8 e7f7g7h7e8f8g8h8
4040    xvilvl.h         xr18,  xr1,   xr3  //a6a7b6b7c6c7d6d7 e6e7f6f7g6g7h6h7
4041    xvilvh.h         xr2,   xr1,   xr3  //a7a8b7b8c7c8d7d8 e7e8f7f8g7g8h7h8
4042    xvaddi.hu        xr3,   xr0,   0
4043    xvmulwev.w.h     xr4,   xr12,  xr24 //01
4044    xvmulwev.w.h     xr5,   xr14,  xr24 //12
4045    xvmulwev.w.h     xr6,   xr16,  xr25 //23
4046    xvmulwev.w.h     xr7,   xr13,  xr25 //34
4047    xvmulwev.w.h     xr8,   xr15,  xr26 //45
4048    xvmulwev.w.h     xr9,   xr17,  xr26 //56
4049    xvmulwev.w.h     xr10,  xr18,  xr27 //67
4050    xvmulwev.w.h     xr11,  xr2,   xr27 //78
4051    xvmaddwod.w.h    xr4,   xr12,  xr24 //01
4052    xvmaddwod.w.h    xr5,   xr14,  xr24 //12
4053    xvmaddwod.w.h    xr6,   xr16,  xr25 //23
4054    xvmaddwod.w.h    xr7,   xr13,  xr25 //34
4055    xvmaddwod.w.h    xr8,   xr15,  xr26 //45
4056    xvmaddwod.w.h    xr9,   xr17,  xr26 //56
4057    xvmaddwod.w.h    xr10,  xr18,  xr27 //67
4058    xvmaddwod.w.h    xr11,  xr2,   xr27 //78
4059    xvadd.w          xr4,   xr4,   xr6
4060    xvadd.w          xr5,   xr5,   xr7
4061    xvadd.w          xr4,   xr4,   xr8
4062    xvadd.w          xr5,   xr5,   xr9
4063    xvadd.w          xr4,   xr4,   xr10
4064    xvadd.w          xr5,   xr5,   xr11
4065    xvaddi.hu        xr12,  xr16,  0 //01 <-- 23
4066    xvaddi.hu        xr14,  xr13,  0 //12 <-- 34
4067    xvaddi.hu        xr16,  xr15,  0 //23 <-- 45
4068    xvaddi.hu        xr13,  xr17,  0 //34 <-- 56
4069    xvaddi.hu        xr15,  xr18,  0 //45 <-- 67
4070    xvaddi.hu        xr17,  xr2,   0 //56 <-- 78
4071    xvssrarni.h.w    xr5,   xr4,   6
4072    xvpermi.d        xr5,   xr5,   0xd8
4073    vst              vr5,   a0,    0
4074    xvpermi.q        xr5,   xr5,   0x11
4075    vstx             vr5,   a0,    t7
4076    alsl.d           a0,    t7,    a0,  1
4077    addi.d           a4,    a4,   -2
4078    bnez             a4,    .l_\lable\()hv_8w_loop_lasx
4079    addi.d           a1,    t0,    8
4080    addi.d           t0,    t0,    8
4081    addi.d           a0,    t8,    16
4082    addi.d           t8,    t8,    16
4083    addi.d           a4,    t5,    0
4084    addi.d           a3,    a3,   -8
4085    bnez             a3,    .l_\lable\()hv_8w_loop0_lasx
4086    fld.d            f24,   sp,    0
4087    fld.d            f25,   sp,    8
4088    fld.d            f26,   sp,    16
4089    fld.d            f27,   sp,    24
4090    addi.d           sp,    sp,    4*8
4091    b                .l_\lable\()end_pre_8tap_lasx
4092
4093.l_\lable\()v_lasx:
4094    srli.w           a7,    a7,    2
4095    blt              t0,    a4,    .l_\lable\()v_idx_fv_lasx
4096    andi             a7,    a7,    1
4097    addi.w           a7,    a7,    3
4098.l_\lable\()v_idx_fv_lasx:
4099    addi.w           t5,    zero,  120
4100    mul.w            a7,    a7,    t5
4101    addi.w           t5,    a6,    -1
4102    slli.w           t5,    t5,    3
4103    add.w            a7,    a7,    t5
4104    add.d            a7,    t6,    a7 //fv's offset
4105    xvldrepl.d       xr8,   a7,    0
4106    xvrepl128vei.h   xr12,  xr8,   0
4107    xvrepl128vei.h   xr13,  xr8,   1
4108    xvrepl128vei.h   xr14,  xr8,   2
4109    xvrepl128vei.h   xr15,  xr8,   3
4110    sub.d            a1,    a1,    t3 // rewind src 3 rows
4111    beq              a3,    t0,    .l_\lable\()v_4w_lasx // w == 4
4112    addi.w           t0,    t0,    4
4113    beq              a3,    t0,    .l_\lable\()v_8w_lasx // w == 8
4114    blt              t0,    a3,    .l_\lable\()v_16w_lasx
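/*
prep vertical-only: the 8-tap sum is accumulated directly in 16-bit
lanes (it stays within int16 for 8bpc inputs) and rounded with a plain
2-bit shift (xvsrari.h), again yielding the px << 4 prep scale:

    tmp[x] = (sum(k) fv[k] * src[x + (k - 3) * stride] + 2) >> 2;
*/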
4115.l_\lable\()v_4w_lasx:
4116    la.local         t6,    subpel_h_shuf3
4117    xvld             xr11,  t6,    0
4118    fld.s            f0,    a1,    0   //a0b0c0d0
4119    fldx.s           f1,    a1,    a2  //a1b1c1d1
4120    fldx.s           f2,    a1,    t2  //a2b2c2d2
4121    add.d            a1,    a1,    t3
4122    fld.s            f3,    a1,    0   //a3b3c3d3
4123    fldx.s           f4,    a1,    a2  //a4b4c4d4
4124    fldx.s           f5,    a1,    t2  //a5b5c5d5
4125    fldx.s           f6,    a1,    t3  //a6b6c6d6
4126    vilvl.w          vr0,   vr1,   vr0 //01
4127    vilvl.w          vr1,   vr3,   vr2 //23
4128    vilvl.d          vr0,   vr1,   vr0 //0123
4129    vilvl.w          vr2,   vr5,   vr4 //45
4130    vilvl.d          vr1,   vr2,   vr1 //2345
4131    xvpermi.q        xr0,   xr1,   0x02 //0123 2345
4132    xvbsrl.v         xr1,   xr0,   4    //123- 345-
4133    xvpermi.q        xr4,   xr6,   0x02
4134    xvextrins.w      xr1,   xr4,   0x30 //1234 3456
4135    xvilvl.b         xr2,   xr1,   xr0  //0112 2334         //a0a1b0b1c0c1d0d1 a1a2b1b2c1c2d1d2 a2a3b2b3c2c3d2d3 a3a4b3b4c3c4d3d4
4136    xvilvh.b         xr3,   xr1,   xr0  //2334 4556         //a2a3b2b3c2c3d2d3 a3a4b3b4c3c4d3d4 a4a5b4b5c4c5d4d5 a5a6b5b6c5c6d5d6
4137.l_\lable\()v_4w_loop_lasx:
4138    add.d            a1,    a1,    t4
4139    fld.s            f0,    a1,    0  //a7b7c7d7
4140    fldx.s           f1,    a1,    a2 //a8b8c8d8
4141    fldx.s           f4,    a1,    t2 //a9b9c9d9
4142    fldx.s           f5,    a1,    t3 //aabacada
4143    vilvl.w          vr7,   vr0,   vr6 //67
4144    vilvl.w          vr10,  vr4,   vr1 //89
4145    vextrins.w       vr7,   vr1,   0x20 //678-
4146    vextrins.w       vr10,  vr5,   0x20 //89a-
4147    xvpermi.q        xr7,   xr10,  0x02 //678- 89a-
4148    xvshuf.b         xr4,   xr7,   xr7,  xr11 //67 78 89 9a //a6a7b6b7c6c7d6d7 a7a8b7b8c7c8d7d8 a8a9b8b9c8c9d8d9 a9aab9bac9cad9da
4149    xvpermi.q        xr7,   xr3,   0x11 //4556
4150    xvpermi.q        xr7,   xr4,   0x02 //45 56 67 78       //a4a5b4b5c4c5d4d5 a5a6b5b6c5c6d5d6 a6a7b6b7c6c7d6d7 a7a8b7b8c7c8d7d8
4151    xvmulwev.h.bu.b  xr16,  xr2,   xr12
4152    xvmulwev.h.bu.b  xr17,  xr3,   xr13
4153    xvmulwev.h.bu.b  xr18,  xr7,   xr14
4154    xvmulwev.h.bu.b  xr19,  xr4,   xr15
4155    xvmaddwod.h.bu.b xr16,  xr2,   xr12
4156    xvmaddwod.h.bu.b xr17,  xr3,   xr13
4157    xvmaddwod.h.bu.b xr18,  xr7,   xr14
4158    xvmaddwod.h.bu.b xr19,  xr4,   xr15
4159    xvadd.h          xr16,  xr16,  xr17
4160    xvadd.h          xr16,  xr16,  xr18
4161    xvadd.h          xr16,  xr16,  xr19
4162    xvsrari.h        xr16,  xr16,  2
4163    xvaddi.bu        xr2,   xr7,   0 // slide row-pair cache down four rows
4164    xvaddi.bu        xr3,   xr4,   0
4165    xvaddi.bu        xr6,   xr5,   0 // keep the newest row for the next 67 pair
4166    xvst             xr16,  a0,    0
4167    addi.d           a0,    a0,    32
4168    addi.w           a4,    a4,   -4
4169    bnez             a4,    .l_\lable\()v_4w_loop_lasx
4170    b                .l_\lable\()end_pre_8tap_lasx
4171
4172.l_\lable\()v_8w_lasx:
4173    fld.d            f0,    a1,    0
4174    fldx.d           f1,    a1,    a2
4175    fldx.d           f2,    a1,    t2
4176    add.d            a1,    a1,    t3
4177    fld.d            f3,    a1,    0
4178    fldx.d           f4,    a1,    a2
4179    fldx.d           f5,    a1,    t2
4180    fldx.d           f6,    a1,    t3
4181    xvpermi.q        xr0,   xr1,   0x02
4182    xvpermi.q        xr1,   xr2,   0x02
4183    xvilvl.b         xr0,   xr1,   xr0 //01 12
4184    xvpermi.q        xr2,   xr3,   0x02
4185    xvpermi.q        xr3,   xr4,   0x02
4186    xvilvl.b         xr2,   xr3,   xr2 //23 34
4187    xvpermi.q        xr4,   xr5,   0x02
4188    xvpermi.q        xr5,   xr6,   0x02
4189    xvilvl.b         xr4,   xr5,   xr4 //45 56
4190.l_\lable\()v_8w_loop_lasx:
4191    add.d            a1,    a1,    t4
4192    fld.d            f7,    a1,    0   //7
4193    fldx.d           f10,   a1,    a2  //8
4194    fldx.d           f11,   a1,    t2  //9
4195    fldx.d           f18,   a1,    t3  //a
4196    xvpermi.q        xr6,   xr7,   0x02
4197    xvpermi.q        xr7,   xr10,  0x02
4198    xvilvl.b         xr6,   xr7,   xr6  //67 78
4199    xvpermi.q        xr10,  xr11,  0x02
4200    xvpermi.q        xr11,  xr18,  0x02
4201    xvilvl.b         xr10,  xr11,  xr10 //89 9a
4202    xvmulwev.h.bu.b  xr1,   xr0,   xr12
4203    xvmulwev.h.bu.b  xr3,   xr2,   xr13
4204    xvmulwev.h.bu.b  xr5,   xr4,   xr14
4205    xvmulwev.h.bu.b  xr7,   xr6,   xr15
4206    xvmulwev.h.bu.b  xr9,   xr2,   xr12
4207    xvmulwev.h.bu.b  xr11,  xr4,   xr13
4208    xvmulwev.h.bu.b  xr16,  xr6,   xr14
4209    xvmulwev.h.bu.b  xr17,  xr10,  xr15
4210    xvmaddwod.h.bu.b xr1,   xr0,   xr12
4211    xvmaddwod.h.bu.b xr3,   xr2,   xr13
4212    xvmaddwod.h.bu.b xr5,   xr4,   xr14
4213    xvmaddwod.h.bu.b xr7,   xr6,   xr15
4214    xvmaddwod.h.bu.b xr9,   xr2,   xr12
4215    xvmaddwod.h.bu.b xr11,  xr4,   xr13
4216    xvmaddwod.h.bu.b xr16,  xr6,   xr14
4217    xvmaddwod.h.bu.b xr17,  xr10,  xr15
4218    xvadd.h          xr1,   xr1,   xr3
4219    xvadd.h          xr1,   xr1,   xr5
4220    xvadd.h          xr1,   xr1,   xr7
4221    xvadd.h          xr9,   xr9,   xr11
4222    xvadd.h          xr9,   xr9,   xr16
4223    xvadd.h          xr9,   xr9,   xr17
4224    xvaddi.bu        xr0,   xr4,   0
4225    xvaddi.bu        xr2,   xr6,   0
4226    xvaddi.bu        xr4,   xr10,  0
4227    xvaddi.bu        xr6,   xr18,  0
4228    xvsrari.h        xr1,   xr1,   2
4229    xvsrari.h        xr9,   xr9,   2
4230    xvst             xr1,   a0,    0
4231    xvst             xr9,   a0,    32
4232    addi.d           a0,    a0,    64
4233    addi.w           a4,    a4,   -4
4234    bnez             a4,    .l_\lable\()v_8w_loop_lasx
4235    b                .l_\lable\()end_pre_8tap_lasx
4236
4237.l_\lable\()v_16w_lasx:
4238    addi.d           t0,    a0,    0 //dst
4239    addi.d           t5,    a1,    0 //src
4240    slli.w           t7,    a3,    1 //w
4241    addi.d           t8,    a4,    0 //h
4242.l_\lable\()v_16w_loop0_lasx:
4243    vld              vr0,   a1,    0
4244    vldx             vr1,   a1,    a2
4245    vldx             vr2,   a1,    t2
4246    add.d            a1,    a1,    t3
4247    vld              vr3,   a1,    0
4248    vldx             vr4,   a1,    a2
4249    vldx             vr5,   a1,    t2
4250    vldx             vr6,   a1,    t3
4251    add.d            a1,    a1,    t4
4252    xvpermi.d        xr0,   xr0,   0xd8
4253    xvpermi.d        xr1,   xr1,   0xd8
4254    xvpermi.d        xr2,   xr2,   0xd8
4255    xvpermi.d        xr3,   xr3,   0xd8
4256    xvpermi.d        xr4,   xr4,   0xd8
4257    xvpermi.d        xr5,   xr5,   0xd8
4258    xvpermi.d        xr6,   xr6,   0xd8
4259    xvilvl.b         xr0,   xr1,   xr0 //01
4260    xvilvl.b         xr1,   xr2,   xr1 //12
4261    xvilvl.b         xr2,   xr3,   xr2 //23
4262    xvilvl.b         xr3,   xr4,   xr3 //34
4263    xvilvl.b         xr4,   xr5,   xr4 //45
4264    xvilvl.b         xr5,   xr6,   xr5 //56
4265.l_\lable\()v_16w_loop_lasx:
4266    vld              vr7,   a1,    0   //7
4267    vldx             vr10,  a1,    a2  //8
4268    add.d            a1,    a1,    t2
4269    xvpermi.d        xr7,   xr7,   0xd8
4270    xvpermi.d        xr10,  xr10,  0xd8
4271    xvilvl.b         xr6,   xr7,   xr6 //67
4272    xvilvl.b         xr7,   xr10,  xr7 //78
4273    xvmulwev.h.bu.b  xr9,   xr0,   xr12
4274    xvmulwev.h.bu.b  xr11,  xr2,   xr13
4275    xvmulwev.h.bu.b  xr16,  xr4,   xr14
4276    xvmulwev.h.bu.b  xr17,  xr6,   xr15
4277    xvmulwev.h.bu.b  xr18,  xr1,   xr12
4278    xvmulwev.h.bu.b  xr19,  xr3,   xr13
4279    xvmulwev.h.bu.b  xr20,  xr5,   xr14
4280    xvmulwev.h.bu.b  xr21,  xr7,   xr15
4281    xvmaddwod.h.bu.b xr9,   xr0,   xr12
4282    xvmaddwod.h.bu.b xr11,  xr2,   xr13
4283    xvmaddwod.h.bu.b xr16,  xr4,   xr14
4284    xvmaddwod.h.bu.b xr17,  xr6,   xr15
4285    xvmaddwod.h.bu.b xr18,  xr1,   xr12
4286    xvmaddwod.h.bu.b xr19,  xr3,   xr13
4287    xvmaddwod.h.bu.b xr20,  xr5,   xr14
4288    xvmaddwod.h.bu.b xr21,  xr7,   xr15
4289    xvadd.h          xr9,   xr9,   xr11
4290    xvadd.h          xr9,   xr9,   xr16
4291    xvadd.h          xr9,   xr9,   xr17
4292    xvadd.h          xr11,  xr18,  xr19
4293    xvadd.h          xr11,  xr11,  xr20
4294    xvadd.h          xr11,  xr11,  xr21
4295    xvsrari.h        xr9,   xr9,   2
4296    xvsrari.h        xr11,  xr11,  2
4297    xvaddi.bu        xr0,   xr2,   0
4298    xvaddi.bu        xr1,   xr3,   0
4299    xvaddi.bu        xr2,   xr4,   0
4300    xvaddi.bu        xr3,   xr5,   0
4301    xvaddi.bu        xr4,   xr6,   0
4302    xvaddi.bu        xr5,   xr7,   0
4303    xvaddi.bu        xr6,   xr10,  0
4304    xvst             xr9,   a0,    0
4305    xvstx            xr11,  a0,    t7
4306    alsl.d           a0,    t7,    a0,  1
4307    addi.d           a4,    a4,   -2
4308    bnez             a4,    .l_\lable\()v_16w_loop_lasx
4309    addi.d           a3,    a3,   -16
4310    addi.d           a0,    t0,    32
4311    addi.d           t0,    t0,    32
4312    addi.d           a1,    t5,    16
4313    addi.d           t5,    t5,    16
4314    addi.d           a4,    t8,    0
4315    bnez             a3,    .l_\lable\()v_16w_loop0_lasx
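    /* 16w+ tiling: the inner loop emits two output rows per iteration
     * (t7 = 2*w bytes, i.e. one row of int16 tmp), then the outer loop
     * steps 16 src pixels / 32 tmp bytes to the next column tile via
     * the t5/t0 copies of src and dst. */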
4316.l_\lable\()end_pre_8tap_lasx:
4317.endm
4318
4319function prep_8tap_regular_8bpc_lasx
4320    addi.w a7, zero, 0
4321    PREP_8TAP_8BPC_LASX 0
4322endfunc
4323
4324function prep_8tap_smooth_regular_8bpc_lasx
4325    addi.w a7, zero, 1
4326    PREP_8TAP_8BPC_LASX 1
4327endfunc
4328
4329function prep_8tap_sharp_regular_8bpc_lasx
4330    addi.w a7, zero, 2
4331    PREP_8TAP_8BPC_LASX 2
4332endfunc
4333
4334function prep_8tap_regular_smooth_8bpc_lasx
4335    addi.w a7, zero, 4
4336    PREP_8TAP_8BPC_LASX 4
4337endfunc
4338
4339function prep_8tap_smooth_8bpc_lasx
4340    addi.w a7, zero, 5
4341    PREP_8TAP_8BPC_LASX 5
4342endfunc
4343
4344function prep_8tap_sharp_smooth_8bpc_lasx
4345    addi.w a7, zero, 6
4346    PREP_8TAP_8BPC_LASX 6
4347endfunc
4348
4349function prep_8tap_regular_sharp_8bpc_lasx
4350    addi.w a7, zero, 8
4351    PREP_8TAP_8BPC_LASX 8
4352endfunc
4353
4354function prep_8tap_smooth_sharp_8bpc_lasx
4355    addi.w a7, zero, 9
4356    PREP_8TAP_8BPC_LASX 9
4357endfunc
4358
4359function prep_8tap_sharp_8bpc_lasx
4360    addi.w a7, zero, 10
4361    PREP_8TAP_8BPC_LASX 10
4362endfunc
4363
4364.macro PREP_8TAP_8BPC_LSX lable
4365    li.w             t0,     4
4366    la.local         t6,     dav1d_mc_subpel_filters
4367    la.local         t7,     shufb1
4368    vld              vr23,   t7,    0
4369    slli.d           t2,     a2,    1  //src_stride*2
4370    add.d            t3,     t2,    a2 //src_stride*3
4371    slli.d           t4,     t2,    1
4372
4373    bnez             a5,     .l_\lable\()h_lsx //mx
4374    bnez             a6,     .l_\lable\()v_lsx
4375
4376    clz.w            t1,     a3
4377    li.w             t5,     24
4378    sub.w            t1,     t1,    t5
4379    la.local         t5,     .l_\lable\()prep_hv0_jtable_lsx
4380    alsl.d           t1,     t1,    t5,   1
4381    ld.h             t8,     t1,    0
4382    add.d            t5,     t5,    t8
4383    jirl             $r0,    t5,    0
4384    .align   3
4385.l_\lable\()prep_hv0_jtable_lsx:
4386    .hword .l_\lable\()hv0_128w_lsx - .l_\lable\()prep_hv0_jtable_lsx
4387    .hword .l_\lable\()hv0_64w_lsx  - .l_\lable\()prep_hv0_jtable_lsx
4388    .hword .l_\lable\()hv0_32w_lsx  - .l_\lable\()prep_hv0_jtable_lsx
4389    .hword .l_\lable\()hv0_16w_lsx  - .l_\lable\()prep_hv0_jtable_lsx
4390    .hword .l_\lable\()hv0_8w_lsx   - .l_\lable\()prep_hv0_jtable_lsx
4391    .hword .l_\lable\()hv0_4w_lsx   - .l_\lable\()prep_hv0_jtable_lsx
4392
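    /* Width dispatch sketch (C, assuming a 32-bit clz): the .hword
     * entries are 16-bit offsets relative to the table base:
     *   idx = clz(w) - 24;              // w=128 -> 0, ..., w=4 -> 5
     *   goto *(table_base + table[idx]);
     */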
4393.l_\lable\()hv0_4w_lsx:
4394    fld.s            f0,     a1,    0
4395    fldx.s           f1,     a1,    a2
4396    add.d            a1,     a1,    t2
4397    vilvl.w          vr0,    vr1,   vr0
4398    vsllwil.hu.bu    vr0,    vr0,   4
4399    vst              vr0,    a0,    0
4400    addi.d           a0,     a0,    16
4401    addi.d           a4,     a4,    -2
4402    bnez             a4,     .l_\lable\()hv0_4w_lsx
4403    b                .l_\lable\()end_pre_8tap_lsx
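    /* mx == my == 0: no filtering, prep only widens the pixels to int16
     * with 4 bits of headroom, i.e. tmp[i] = src[i] << 4 (the
     * vsllwil.hu.bu ..., 4 above); the wider hv0 paths below do the
     * same per 16-pixel tile. */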
4404.l_\lable\()hv0_8w_lsx:
4405    fld.d            f0,     a1,    0
4406    fldx.d           f1,     a1,    a2
4407    add.d            a1,     a1,    t2
4408    vsllwil.hu.bu    vr0,    vr0,   4
4409    vsllwil.hu.bu    vr1,    vr1,   4
4410    vst              vr0,    a0,    0
4411    vst              vr1,    a0,    16
4412    addi.d           a0,     a0,    32
4413    addi.d           a4,     a4,    -2
4414    bnez             a4,     .l_\lable\()hv0_8w_lsx
4415    b                .l_\lable\()end_pre_8tap_lsx
4416.l_\lable\()hv0_16w_lsx:
4417    vld              vr0,    a1,    0
4418    vldx             vr1,    a1,    a2
4419    add.d            a1,     a1,    t2
4420    vsllwil.hu.bu    vr2,    vr0,   4
4421    vsllwil.hu.bu    vr4,    vr1,   4
4422    vexth.hu.bu      vr3,    vr0
4423    vexth.hu.bu      vr5,    vr1
4424    vslli.h          vr3,    vr3,   4
4425    vslli.h          vr5,    vr5,   4
4426    vst              vr2,    a0,    0
4427    vst              vr3,    a0,    16
4428    vst              vr4,    a0,    32
4429    vst              vr5,    a0,    48
4430    addi.d           a0,     a0,    64
4431    addi.d           a4,     a4,    -2
4432    bnez             a4,     .l_\lable\()hv0_16w_lsx
4433    b                .l_\lable\()end_pre_8tap_lsx
4434.l_\lable\()hv0_32w_lsx:
4435.l_\lable\()hv0_64w_lsx:
4436.l_\lable\()hv0_128w_lsx:
4437    addi.d           t0,     a1,    0
4438    addi.d           t5,     a4,    0
4439    srli.w           t7,     a3,    4
4440    slli.w           t7,     t7,    5
4441    addi.d           t8,     a0,    0
4442.l_\lable\()hv0_16_loop_lsx:
4443    vld              vr0,    a1,    0
4444    vldx             vr1,    a1,    a2
4445    add.d            a1,     a1,    t2
4446    vsllwil.hu.bu    vr2,    vr0,   4
4447    vsllwil.hu.bu    vr3,    vr1,   4
4448    vexth.hu.bu      vr0,    vr0
4449    vexth.hu.bu      vr1,    vr1
4450    vslli.h          vr0,    vr0,   4
4451    vslli.h          vr1,    vr1,   4
4452    vst              vr2,    a0,    0
4453    vst              vr0,    a0,    16
4454    add.d            a0,     a0,    t7
4455    vst              vr3,    a0,    0
4456    vst              vr1,    a0,    16
4457    add.d            a0,     a0,    t7
4458    addi.d           a4,     a4,    -2
4459    bnez             a4,     .l_\lable\()hv0_16_loop_lsx
4460    addi.d           a1,     t0,    16
4461    addi.d           t0,     t0,    16
4462    addi.d           a0,     t8,    32
4463    addi.d           t8,     t8,    32
4464    addi.d           a4,     t5,    0
4465    addi.d           a3,     a3,    -16
4466    bnez             a3,     .l_\lable\()hv0_16_loop_lsx
4467    b                .l_\lable\()end_pre_8tap_lsx
4468.l_\lable\()h_lsx:
4469    bnez             a6,     .l_\lable\()hv_lsx //if(fh) && if (fv)
4470
4471    andi             t1,     a7,    3
4472    blt              t0,     a3,    .l_\lable\()h_idx_fh_lsx
4473    andi             t1,     a7,    1
4474    addi.w           t1,     t1,    3
4475.l_\lable\()h_idx_fh_lsx:
4476    addi.w           t5,     zero,  120
4477    mul.w            t1,     t1,    t5
4478    addi.w           t5,     a5,    -1
4479    slli.w           t5,     t5,    3
4480    add.w            t1,     t1,    t5
4481    add.d            t1,     t6,    t1 //fh's offset
4482    vldrepl.d        vr23,   t1,    0
4483
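    /* Horizontal tap selection mirrors dav1d's C prep_8tap (a sketch):
     *   fh = dav1d_mc_subpel_filters[w > 4 ? type & 3
     *                                      : 3 + (type & 1)][mx - 1];
     * each filter table is 15 entries x 8 int8 taps, hence the 120-byte
     * row stride and the (mx-1)*8 offset computed above. */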
4484    addi.d           a1,     a1,    -3
4485    clz.w            t1,     a3
4486    li.w             t5,     24
4487    sub.w            t1,     t1,    t5
4488    la.local         t5,     .l_\lable\()prep_h_jtable_lsx
4489    alsl.d           t1,     t1,    t5,   1
4490    ld.h             t8,     t1,    0
4491    add.d            t5,     t5,    t8
4492    jirl             $r0,    t5,    0
4493
4494    .align   3
4495.l_\lable\()prep_h_jtable_lsx:
4496    .hword .l_\lable\()h_128w_lsx - .l_\lable\()prep_h_jtable_lsx
4497    .hword .l_\lable\()h_64w_lsx  - .l_\lable\()prep_h_jtable_lsx
4498    .hword .l_\lable\()h_32w_lsx  - .l_\lable\()prep_h_jtable_lsx
4499    .hword .l_\lable\()h_16w_lsx  - .l_\lable\()prep_h_jtable_lsx
4500    .hword .l_\lable\()h_8w_lsx   - .l_\lable\()prep_h_jtable_lsx
4501    .hword .l_\lable\()h_4w_lsx   - .l_\lable\()prep_h_jtable_lsx
4502
4503.l_\lable\()h_4w_lsx:
4504    addi.d           a1,     a1,    2
4505    la.local         t7,     subpel_h_shuf1
4506    vld              vr7,    t7,    0
4507    vbsrl.v          vr23,   vr23,  2
4508    vreplvei.w       vr23,   vr23,  0
4509.l_\lable\()h_4w_loop_lsx:
4510    vld              vr0,    a1,    0
4511    vldx             vr1,    a1,    a2
4512    add.d            a1,     a1,    t2
4513    vshuf.b          vr0,    vr0,   vr0,   vr7
4514    vshuf.b          vr1,    vr1,   vr1,   vr7
4515    vmulwev.h.bu.b   vr2,    vr0,   vr23
4516    vmulwev.h.bu.b   vr3,    vr1,   vr23
4517    vmaddwod.h.bu.b  vr2,    vr0,   vr23
4518    vmaddwod.h.bu.b  vr3,    vr1,   vr23
4519    vhaddw.w.h       vr0,    vr2,   vr2
4520    vhaddw.w.h       vr1,    vr3,   vr3
4521    vssrarni.h.w     vr1,    vr0,   2
4522    vst              vr1,    a0,    0
4523    addi.d           a0,     a0,    16
4524    addi.w           a4,     a4,    -2
4525    bnez             a4,     .l_\lable\()h_4w_loop_lsx
4526    b                .l_\lable\()end_pre_8tap_lsx
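    /* w == 4 note: the filters selected for w <= 4 only have four
     * non-zero middle taps, so the tap vector was shifted down by two
     * (vbsrl.v ..., 2) and the src pointer advanced by 2 px to match. */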
4527
4528.l_\lable\()h_8w_lsx:
4529    vreplvei.w       vr22,   vr23,  0 //fh
4530    vreplvei.w       vr23,   vr23,  1
4531    la.local         t7,     subpel_h_shuf1
4532    vld              vr6,    t7,    0
4533    vaddi.bu         vr7,    vr6,   4
4534    vaddi.bu         vr8,    vr6,   8
4535.l_\lable\()h_8w_loop_lsx:
4536    vld              vr0,    a1,    0
4537    vldx             vr1,    a1,    a2
4538    add.d            a1,     a1,    t2
4539    PREP_H_8W        vr0
4540    PREP_H_8W        vr1
4541    vst              vr0,    a0,    0
4542    vst              vr1,    a0,    16
4543    addi.d           a0,     a0,    32
4544    addi.d           a4,     a4,    -2
4545    bnez             a4,     .l_\lable\()h_8w_loop_lsx
4546    b                .l_\lable\()end_pre_8tap_lsx
4547
4548.l_\lable\()h_16w_lsx:
4549.l_\lable\()h_32w_lsx:
4550.l_\lable\()h_64w_lsx:
4551.l_\lable\()h_128w_lsx:
4552    vreplvei.w       vr22,   vr23,  0 //fh
4553    vreplvei.w       vr23,   vr23,  1
4554    la.local         t7,     subpel_h_shuf1
4555    vld              vr6,    t7,    0
4556    vaddi.bu         vr7,    vr6,   4
4557    vaddi.bu         vr8,    vr6,   8
4558    srli.w           t7,     a3,    4
4559    slli.w           t6,     t7,    5
4560.l_\lable\()h_16w_loop0_lsx:
4561    addi.d           t0,     a1,    0 //src
4562    addi.d           t5,     a4,    0 //h
4563    addi.d           t8,     a0,    0 //dst
4564.l_\lable\()h_16w_loop_lsx:
4565    vld              vr0,    a1,    0
4566    vld              vr1,    a1,    8
4567    add.d            a1,     a1,    a2
4568    PREP_H_8W        vr0
4569    PREP_H_8W        vr1
4570    vst              vr0,    a0,    0
4571    vst              vr1,    a0,    16
4572    add.d            a0,     a0,    t6
4573    addi.d           t5,     t5,    -1
4574    bnez             t5,     .l_\lable\()h_16w_loop_lsx
4575    addi.d           a1,     t0,    16
4576    addi.d           a0,     t8,    32
4577    addi.w           t7,     t7,    -1
4578    bnez             t7,     .l_\lable\()h_16w_loop0_lsx
4579    b                .l_\lable\()end_pre_8tap_lsx
4580
4581.l_\lable\()hv_lsx:
4582    andi             t1,     a7,    3
4583    blt              t0,     a3,    .l_\lable\()hv_idx_fh_lsx
4584    andi             t1,     a7,    1
4585    addi.w           t1,     t1,    3
4586.l_\lable\()hv_idx_fh_lsx:
4587    addi.w           t5,     zero,  120
4588    mul.w            t1,     t1,    t5
4589    addi.w           t5,     a5,    -1
4590    slli.w           t5,     t5,    3
4591    add.w            t1,     t1,    t5
4592    add.d            t1,     t6,    t1 //fh's offset
4593    vldrepl.d        vr8,    t1,    0
4594    srli.w           a7,     a7,    2
4595    blt              t0,     a4,    .l_\lable\()hv_idx_fv_lsx
4596    andi             a7,     a7,    1
4597    addi.w           a7,     a7,    3
4598.l_\lable\()hv_idx_fv_lsx:
4599    addi.w           t5,     zero,  120
4600    mul.w            a7,     a7,    t5
4601    addi.w           t5,     a6,    -1
4602    slli.w           t5,     t5,    3
4603    add.w            a7,     a7,    t5
4604    add.d            a7,     t6,    a7 //fv's offset
4605    vldrepl.d        vr9,    a7,    0
4606    vsllwil.h.b      vr9,    vr9,   0
4607
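    /* vsllwil.h.b widens the vertical taps to int16 for the second
     * stage.  Two-pass model for the 8bpc hv prep, per the shifts used
     * below (a sketch):
     *   mid[y][x] = (sum_k fh[k]*src[y][x+k-3] +  2) >> 2; // ssrarni 2
     *   tmp[y][x] = (sum_k fv[k]*mid[y+k-3][x] + 32) >> 6; // ssrarni 6
     */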
4608    sub.d            a1,     a1,    t3
4609    addi.d           a1,     a1,    -3
4610    beq              a3,     t0,    .l_\lable\()hv_4w_lsx
4611    b                .l_\lable\()hv_8w_lsx
4612.l_\lable\()hv_4w_lsx:
4613    addi.d           a1,     a1,    2 //4-tap filter: skip the two zero leading taps
4614    vld              vr0,    a1,    0
4615    vldx             vr1,    a1,    a2
4616    vldx             vr2,    a1,    t2
4617    add.d            a1,     a1,    t3
4618    vld              vr3,    a1,    0
4619    vldx             vr4,    a1,    a2
4620    vldx             vr5,    a1,    t2
4621    vldx             vr6,    a1,    t3
4622    add.d            a1,     a1,    t4
4623
4624    la.local         t1,     subpel_h_shuf1
4625    vld              vr7,    t1,    0
4626    vbsrl.v          vr8,    vr8,   2
4627    vreplvei.w       vr8,    vr8,   0
4628
4629    //fv
4630    vreplvei.w       vr17,   vr9,   0
4631    vreplvei.w       vr18,   vr9,   1
4632    vreplvei.w       vr19,   vr9,   2
4633    vreplvei.w       vr20,   vr9,   3
4634
4635    //DAV1D_FILTER_8TAP_RND
4636    vshuf.b          vr0,    vr0,   vr0,  vr7
4637    vshuf.b          vr1,    vr1,   vr1,  vr7
4638    vshuf.b          vr2,    vr2,   vr2,  vr7
4639    vshuf.b          vr3,    vr3,   vr3,  vr7
4640    vshuf.b          vr4,    vr4,   vr4,  vr7
4641    vshuf.b          vr5,    vr5,   vr5,  vr7
4642    vshuf.b          vr6,    vr6,   vr6,  vr7
4643
4644    vmulwev.h.bu.b   vr10,   vr0,   vr8
4645    vmulwev.h.bu.b   vr11,   vr1,   vr8
4646    vmulwev.h.bu.b   vr12,   vr2,   vr8
4647    vmulwev.h.bu.b   vr13,   vr3,   vr8
4648    vmulwev.h.bu.b   vr14,   vr4,   vr8
4649    vmulwev.h.bu.b   vr15,   vr5,   vr8
4650    vmulwev.h.bu.b   vr16,   vr6,   vr8
4651    vmaddwod.h.bu.b  vr10,   vr0,   vr8
4652    vmaddwod.h.bu.b  vr11,   vr1,   vr8
4653    vmaddwod.h.bu.b  vr12,   vr2,   vr8
4654    vmaddwod.h.bu.b  vr13,   vr3,   vr8
4655    vmaddwod.h.bu.b  vr14,   vr4,   vr8
4656    vmaddwod.h.bu.b  vr15,   vr5,   vr8
4657    vmaddwod.h.bu.b  vr16,   vr6,   vr8
4658
4659    vhaddw.w.h       vr10,   vr10,  vr10
4660    vhaddw.w.h       vr11,   vr11,  vr11
4661    vhaddw.w.h       vr12,   vr12,  vr12
4662    vhaddw.w.h       vr13,   vr13,  vr13
4663    vhaddw.w.h       vr14,   vr14,  vr14
4664    vhaddw.w.h       vr15,   vr15,  vr15
4665    vhaddw.w.h       vr16,   vr16,  vr16
4666
4667    vssrarni.h.w     vr10,   vr10,  2 //h0
4668    vssrarni.h.w     vr11,   vr11,  2 //h1
4669    vssrarni.h.w     vr12,   vr12,  2 //h2
4670    vssrarni.h.w     vr13,   vr13,  2 //h3
4671    vssrarni.h.w     vr14,   vr14,  2 //h4
4672    vssrarni.h.w     vr15,   vr15,  2 //h5
4673    vssrarni.h.w     vr16,   vr16,  2 //h6
4674
4675    //h0
4676    vilvl.h          vr0,    vr11,  vr10 //01
4677    vilvl.h          vr1,    vr13,  vr12 //23
4678    vilvl.h          vr2,    vr15,  vr14 //45
4679    //h1
4680    vilvl.h          vr4,    vr12,  vr11 //12
4681    vilvl.h          vr5,    vr14,  vr13 //34
4682    vilvl.h          vr6,    vr16,  vr15 //56
4683
4684.l_\lable\()hv_w4_loop_lsx:
4685    vld              vr9,    a1,    0
4686    vldx             vr10,   a1,    a2
4687    add.d            a1,     a1,    t2
4688
4689    //DAV1D_FILTER_8TAP_CLIP
4690    vshuf.b          vr9,    vr9,   vr9,  vr7
4691    vshuf.b          vr10,   vr10,  vr10, vr7
4692    vmulwev.h.bu.b   vr11,   vr9,   vr8
4693    vmulwev.h.bu.b   vr12,   vr10,  vr8
4694    vmaddwod.h.bu.b  vr11,   vr9,   vr8
4695    vmaddwod.h.bu.b  vr12,   vr10,  vr8
4696    vhaddw.w.h       vr11,   vr11,  vr11
4697    vhaddw.w.h       vr12,   vr12,  vr12
4698    vssrarni.h.w     vr11,   vr11,  2 //h7
4699    vssrarni.h.w     vr12,   vr12,  2 //h8
4700    vilvl.h          vr3,    vr11,  vr16 //67
4701    vilvl.h          vr13,   vr12,  vr11 //78
4702
4703    vmulwev.w.h      vr9,    vr0,   vr17
4704    vmulwev.w.h      vr10,   vr1,   vr18
4705    vmulwev.w.h      vr14,   vr2,   vr19
4706    vmulwev.w.h      vr15,   vr3,   vr20
4707    vmaddwod.w.h     vr9,    vr0,   vr17
4708    vmaddwod.w.h     vr10,   vr1,   vr18
4709    vmaddwod.w.h     vr14,   vr2,   vr19
4710    vmaddwod.w.h     vr15,   vr3,   vr20
4711    vadd.w           vr16,   vr9,   vr10
4712    vadd.w           vr16,   vr16,  vr14
4713    vadd.w           vr16,   vr16,  vr15
4714
4715    vmulwev.w.h      vr9,    vr4,   vr17
4716    vmulwev.w.h      vr10,   vr5,   vr18
4717    vmulwev.w.h      vr14,   vr6,   vr19
4718    vmulwev.w.h      vr15,   vr13,  vr20
4719    vmaddwod.w.h     vr9,    vr4,   vr17
4720    vmaddwod.w.h     vr10,   vr5,   vr18
4721    vmaddwod.w.h     vr14,   vr6,   vr19
4722    vmaddwod.w.h     vr15,   vr13,  vr20
4723    vadd.w           vr21,   vr9,   vr10
4724    vadd.w           vr21,   vr21,  vr14
4725    vadd.w           vr21,   vr21,  vr15
4726
4727    vssrarni.h.w     vr21,   vr16,  6
4728    //cache
4729    vaddi.hu         vr0,    vr1,   0
4730    vaddi.hu         vr1,    vr2,   0
4731    vaddi.hu         vr2,    vr3,   0
4732    vaddi.hu         vr4,    vr5,   0
4733    vaddi.hu         vr5,    vr6,   0
4734    vaddi.hu         vr6,    vr13,  0
4735    vaddi.hu         vr16,   vr12,  0
4736
4737    vst              vr21,   a0,    0
4738    addi.d           a0,     a0,    16
4739    addi.d           a4,     a4,    -2
4740    bnez             a4,     .l_\lable\()hv_w4_loop_lsx
4741    b                .l_\lable\()end_pre_8tap_lsx
4742
4743.l_\lable\()hv_8w_lsx:
4744.l_\lable\()hv_16w_lsx:
4745.l_\lable\()hv_32w_lsx:
4746.l_\lable\()hv_64w_lsx:
4747.l_\lable\()hv_128w_lsx:
4748    addi.d          sp,      sp,    -8*8
4749    fst.d           f24,     sp,    0
4750    fst.d           f25,     sp,    8
4751    fst.d           f26,     sp,    16
4752    fst.d           f27,     sp,    24
4753    fst.d           f28,     sp,    32
4754    fst.d           f29,     sp,    40
4755    fst.d           f30,     sp,    48
4756    fst.d           f31,     sp,    56
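    /* f24-f31 (fs0-fs7) are callee-saved in the LoongArch psABI (only
     * their low 64 bits), so this wide hv path spills them with fst.d
     * before using vr24-vr31 as row caches. */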
4757    addi.d          t0,      a1,    0 //src
4758    addi.d          t5,      a4,    0 //h
4759    addi.d          t8,      a0,    0 //dst
4760    slli.w          t6,      a3,    1
4761    la.local        t1,      subpel_h_shuf1
4762    vld             vr7,     t1,    0
4763    vaddi.bu        vr11,    vr7,   4
4764    vaddi.bu        vr12,    vr7,   8
4765    vreplvei.w      vr10,    vr8,   1
4766    vreplvei.w      vr8,     vr8,   0
4767    vreplvei.w      vr20,    vr9,   1
4768    vreplvei.w      vr21,    vr9,   2
4769    vreplvei.w      vr22,    vr9,   3
4770    vreplvei.w      vr9,     vr9,   0
4771.l_\lable\()prep_hv_8w_loop0_lsx:
4772    vld             vr0,     a1,    0
4773    vldx            vr1,     a1,    a2
4774    vldx            vr2,     a1,    t2
4775    add.d           a1,      a1,    t3
4776    vld             vr3,     a1,    0
4777    vldx            vr4,     a1,    a2
4778    vldx            vr5,     a1,    t2
4779    vldx            vr6,     a1,    t3
4780    add.d           a1,      a1,    t4
4781
4782    FILTER_8TAP_8W  vr0 //h0
4783    FILTER_8TAP_8W  vr1 //h1
4784    FILTER_8TAP_8W  vr2 //h2
4785    FILTER_8TAP_8W  vr3 //h3
4786    FILTER_8TAP_8W  vr4 //h4
4787    FILTER_8TAP_8W  vr5 //h5
4788    FILTER_8TAP_8W  vr6 //h6
4789
4790    //h0' low part
4791    vilvl.h         vr23,    vr1,   vr0 //01
4792    vilvl.h         vr24,    vr3,   vr2 //23
4793    vilvl.h         vr25,    vr5,   vr4 //45
4794    //h0' high part
4795    vilvh.h         vr26,    vr1,   vr0 //01
4796    vilvh.h         vr27,    vr3,   vr2 //23
4797    vilvh.h         vr28,    vr5,   vr4 //45
4798
4799    //h1' low part
4800    vilvl.h         vr29,    vr2,   vr1 //12
4801    vilvl.h         vr30,    vr4,   vr3 //34
4802    vilvl.h         vr31,    vr6,   vr5 //56
4803    //h1' high part
4804    vilvh.h         vr0,     vr2,   vr1 //12
4805    vilvh.h         vr1,     vr4,   vr3 //34
4806    vilvh.h         vr2,     vr6,   vr5 //56
4807
4808.l_\lable\()prep_hv_8w_loop_lsx:
4809    vld             vr3,     a1,    0
4810    vldx            vr4,     a1,    a2
4811    add.d           a1,      a1,    t2
4812
4813    FILTER_8TAP_8W  vr3 //h7
4814    FILTER_8TAP_8W  vr4 //h8
4815
4816    //h0' low part
4817    vilvl.h         vr16,    vr3,   vr6 //67 ~low
4818    vmulwev.w.h     vr13,    vr23,  vr9
4819    vmulwev.w.h     vr14,    vr24,  vr20
4820    vmulwev.w.h     vr15,    vr25,  vr21
4821    vmulwev.w.h     vr17,    vr16,  vr22
4822    vmaddwod.w.h    vr13,    vr23,  vr9
4823    vmaddwod.w.h    vr14,    vr24,  vr20
4824    vmaddwod.w.h    vr15,    vr25,  vr21
4825    vmaddwod.w.h    vr17,    vr16,  vr22
4826    vadd.w          vr13,    vr13,  vr14
4827    vadd.w          vr13,    vr13,  vr15
4828    vadd.w          vr13,    vr13,  vr17
4829    //cache
4830    vaddi.hu        vr23,    vr24,  0
4831    vaddi.hu        vr24,    vr25,  0
4832    vaddi.hu        vr25,    vr16,  0
4833
4834    //h0' high part
4835    vilvh.h         vr17,    vr3,   vr6 //67 ~high
4836    vmulwev.w.h     vr14,    vr26,  vr9
4837    vmulwev.w.h     vr15,    vr27,  vr20
4838    vmulwev.w.h     vr16,    vr28,  vr21
4839    vmulwev.w.h     vr18,    vr17,  vr22
4840    vmaddwod.w.h    vr14,    vr26,  vr9
4841    vmaddwod.w.h    vr15,    vr27,  vr20
4842    vmaddwod.w.h    vr16,    vr28,  vr21
4843    vmaddwod.w.h    vr18,    vr17,  vr22
4844    vadd.w          vr14,    vr14,  vr15
4845    vadd.w          vr14,    vr14,  vr16
4846    vadd.w          vr14,    vr14,  vr18
4847    vssrarni.h.w    vr14,    vr13,  6
4848    vst             vr14,    a0,    0
4849    add.d           a0,      a0,    t6
4850    //cache
4851    vaddi.hu        vr26,    vr27,  0
4852    vaddi.hu        vr27,    vr28,  0
4853    vaddi.hu        vr28,    vr17,  0
4854    vaddi.hu        vr6,     vr4,   0
4855
4856    vilvl.h         vr5,     vr4,   vr3 //78 ~low
4857    vilvh.h         vr4,     vr4,   vr3 //78 ~high
4858
4859    //h1' low part
4860    vmulwev.w.h     vr13,    vr29,  vr9
4861    vmulwev.w.h     vr14,    vr30,  vr20
4862    vmulwev.w.h     vr15,    vr31,  vr21
4863    vmulwev.w.h     vr16,    vr5,   vr22
4864    vmaddwod.w.h    vr13,    vr29,  vr9
4865    vmaddwod.w.h    vr14,    vr30,  vr20
4866    vmaddwod.w.h    vr15,    vr31,  vr21
4867    vmaddwod.w.h    vr16,    vr5,   vr22
4868    vadd.w          vr13,    vr13,  vr14
4869    vadd.w          vr13,    vr13,  vr15
4870    vadd.w          vr13,    vr13,  vr16
4871    //cache
4872    vaddi.hu        vr29,    vr30,  0
4873    vaddi.hu        vr30,    vr31,  0
4874    vaddi.hu        vr31,    vr5,   0
4875
4876    //h1' high part
4877    vmulwev.w.h     vr14,    vr0,   vr9
4878    vmulwev.w.h     vr15,    vr1,   vr20
4879    vmulwev.w.h     vr16,    vr2,   vr21
4880    vmulwev.w.h     vr17,    vr4,   vr22
4881    vmaddwod.w.h    vr14,    vr0,   vr9
4882    vmaddwod.w.h    vr15,    vr1,   vr20
4883    vmaddwod.w.h    vr16,    vr2,   vr21
4884    vmaddwod.w.h    vr17,    vr4,   vr22
4885    vadd.w          vr14,    vr14,  vr15
4886    vadd.w          vr14,    vr14,  vr16
4887    vadd.w          vr14,    vr14,  vr17
4888    vssrarni.h.w    vr14,    vr13,  6
4889    vst             vr14,    a0,    0
4890    add.d           a0,      a0,    t6
4891    //cache
4892    vaddi.hu        vr0,     vr1,   0
4893    vaddi.hu        vr1,     vr2,   0
4894    vaddi.hu        vr2,     vr4,   0
4895    addi.w          a4,      a4,    -2
4896    bnez            a4,      .l_\lable\()prep_hv_8w_loop_lsx
4897    addi.d          a1,      t0,    8
4898    addi.d          t0,      t0,    8
4899    addi.d          a0,      t8,    16
4900    addi.d          t8,      t8,    16
4901    addi.d          a4,      t5,    0
4902    addi.w          a3,      a3,    -8
4903    bnez            a3,      .l_\lable\()prep_hv_8w_loop0_lsx
4904    fld.d           f24,     sp,    0
4905    fld.d           f25,     sp,    8
4906    fld.d           f26,     sp,    16
4907    fld.d           f27,     sp,    24
4908    fld.d           f28,     sp,    32
4909    fld.d           f29,     sp,    40
4910    fld.d           f30,     sp,    48
4911    fld.d           f31,     sp,    56
4912    addi.d          sp,      sp,    8*8
4913    b                .l_\lable\()end_pre_8tap_lsx
4914
4915.l_\lable\()v_lsx:
4916    srli.w           a7,    a7,     2
4917    blt              t0,    a4,     .l_\lable\()v_idx_fv_lsx
4918    andi             a7,    a7,     1
4919    addi.w           a7,    a7,     3
4920.l_\lable\()v_idx_fv_lsx:
4921    addi.w           t5,     zero,  120
4922    mul.w            a7,     a7,    t5
4923    addi.w           t5,     a6,    -1
4924    slli.w           t5,     t5,    3
4925    add.w            a7,     a7,    t5
4926    add.d            a7,     t6,    a7 //fv's offset
4927    vldrepl.d        vr8,    a7,    0
4928
4929    vilvl.h          vr8,    vr8,   vr8
4930    vreplvei.w       vr9,    vr8,   1
4931    vreplvei.w       vr10,   vr8,   2
4932    vreplvei.w       vr11,   vr8,   3
4933    vreplvei.w       vr8,    vr8,   0
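    /* Vertical tap selection (sketch, mirroring dav1d's C prep_8tap):
     *   fv = dav1d_mc_subpel_filters[h > 4 ? type >> 2
     *                                      : 3 + ((type >> 2) & 1)][my - 1];
     * vilvl.h duplicated each int8 tap pair, and the vreplvei.w above
     * broadcast pairs {f0,f1}..{f6,f7} into vr8..vr11. */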
4934
4935    sub.d            a1,     a1,    t3
4936    beq              a3,     t0,    .l_\lable\()v_4w_lsx
4937    blt              t0,     a3,    .l_\lable\()v_8w_lsx
4938.l_\lable\()v_4w_lsx:
4939    fld.s            f0,     a1,    0
4940    fldx.s           f1,     a1,    a2
4941    fldx.s           f2,     a1,    t2
4942    add.d            a1,     a1,    t3
4943    fld.s            f3,     a1,    0
4944    fldx.s           f4,     a1,    a2
4945    fldx.s           f5,     a1,    t2
4946    fldx.s           f6,     a1,    t3
4947    add.d            a1,     a1,    t4
4948
4949    vilvl.w          vr0,    vr1,   vr0
4950    vilvl.w          vr1,    vr2,   vr1
4951    vilvl.b          vr0,    vr1,   vr0 //0 1 1 2
4952    vilvl.w          vr1,    vr3,   vr2
4953    vilvl.w          vr2,    vr4,   vr3
4954    vilvl.b          vr1,    vr2,   vr1 //2 3 3 4
4955    vilvl.w          vr2,    vr5,   vr4
4956    vilvl.w          vr3,    vr6,   vr5
4957    vilvl.b          vr2,    vr3,   vr2 //4 5 5 6
4958.l_\lable\()v_4w_loop_lsx:
4959    fld.s            f7,     a1,     0
4960
4961    vilvl.w          vr3,    vr7,   vr6
4962    fldx.s           f6,     a1,    a2
4963    add.d            a1,     a1,    t2
4964    vilvl.w          vr4,    vr6,   vr7
4965    vilvl.b          vr3,    vr4,   vr3 //6 7 7 8
4966
4967    vmulwev.h.bu.b   vr12,   vr0,   vr8
4968    vmulwev.h.bu.b   vr13,   vr1,   vr9
4969    vmulwev.h.bu.b   vr14,   vr2,   vr10
4970    vmulwev.h.bu.b   vr15,   vr3,   vr11
4971    vmaddwod.h.bu.b  vr12,   vr0,   vr8
4972    vmaddwod.h.bu.b  vr13,   vr1,   vr9
4973    vmaddwod.h.bu.b  vr14,   vr2,   vr10
4974    vmaddwod.h.bu.b  vr15,   vr3,   vr11
4975    vaddi.hu         vr0,    vr1,   0
4976    vaddi.hu         vr1,    vr2,   0
4977    vaddi.hu         vr2,    vr3,   0
4978    vadd.h           vr12,   vr12,  vr13
4979    vadd.h           vr12,   vr12,  vr14
4980    vadd.h           vr12,   vr12,  vr15
4981
4982    vsrari.h         vr12,   vr12,  2
4983    vst              vr12,   a0,    0
4984    addi.d           a0,     a0,    16
4985    addi.w           a4,     a4,    -2
4986    bnez             a4,     .l_\lable\()v_4w_loop_lsx
4987    b                .l_\lable\()end_pre_8tap_lsx
4988
4989.l_\lable\()v_8w_lsx:
4990    addi.d           t0,     a1,    0
4991    addi.d           t5,     a4,    0
4992    addi.d           t8,     a0,    0
4993    slli.w           t6,     a3,    1
4994.l_\lable\()v_8w_loop0_lsx:
4995    fld.d            f0,     a1,    0
4996    fldx.d           f1,     a1,    a2
4997    fldx.d           f2,     a1,    t2
4998    add.d            a1,     a1,    t3
4999    fld.d            f3,     a1,    0
5000    fldx.d           f4,     a1,    a2
5001    fldx.d           f5,     a1,    t2
5002    fldx.d           f6,     a1,    t3
5003    add.d            a1,     a1,    t4
5004
5005    vilvl.b          vr0,    vr1,   vr0 //0 1
5006    vilvl.b          vr1,    vr2,   vr1 //1 2
5007    vilvl.b          vr2,    vr3,   vr2 //2 3
5008    vilvl.b          vr3,    vr4,   vr3 //3 4
5009    vilvl.b          vr4,    vr5,   vr4 //4 5
5010    vilvl.b          vr5,    vr6,   vr5 //5 6
5011.l_\lable\()v_8w_loop_lsx:
5012    fld.d            f7,     a1,    0
5013    vilvl.b          vr12,   vr7,   vr6 //6 7
5014    fldx.d           f6,     a1,    a2
5015    add.d            a1,     a1,    t2
5016    vilvl.b          vr13,   vr6,   vr7 //7 8
5017
5018    vmulwev.h.bu.b   vr14,   vr0,   vr8
5019    vmulwev.h.bu.b   vr15,   vr1,   vr8
5020    vmulwev.h.bu.b   vr16,   vr2,   vr9
5021    vmulwev.h.bu.b   vr17,   vr3,   vr9
5022    vmulwev.h.bu.b   vr18,   vr4,   vr10
5023    vmulwev.h.bu.b   vr19,   vr5,   vr10
5024    vmulwev.h.bu.b   vr20,   vr12,  vr11
5025    vmulwev.h.bu.b   vr21,   vr13,  vr11
5026    vmaddwod.h.bu.b  vr14,   vr0,   vr8
5027    vmaddwod.h.bu.b  vr15,   vr1,   vr8
5028    vmaddwod.h.bu.b  vr16,   vr2,   vr9
5029    vmaddwod.h.bu.b  vr17,   vr3,   vr9
5030    vmaddwod.h.bu.b  vr18,   vr4,   vr10
5031    vmaddwod.h.bu.b  vr19,   vr5,   vr10
5032    vmaddwod.h.bu.b  vr20,   vr12,  vr11
5033    vmaddwod.h.bu.b  vr21,   vr13,  vr11
5034
5035    vaddi.hu         vr0,    vr2,   0
5036    vaddi.hu         vr1,    vr3,   0
5037    vaddi.hu         vr2,    vr4,   0
5038    vaddi.hu         vr3,    vr5,   0
5039    vaddi.hu         vr4,    vr12,  0
5040    vaddi.hu         vr5,    vr13,  0
5041    vadd.h           vr14,   vr14,  vr16
5042    vadd.h           vr14,   vr14,  vr18
5043    vadd.h           vr14,   vr14,  vr20
5044    vadd.h           vr15,   vr15,  vr17
5045    vadd.h           vr15,   vr15,  vr19
5046    vadd.h           vr15,   vr15,  vr21
5047
5048    vsrari.h         vr14,   vr14,  2
5049    vsrari.h         vr15,   vr15,  2
5050    vst              vr14,   a0,    0
5051    add.d            a0,     a0,    t6
5052    vst              vr15,   a0,    0
5053    add.d            a0,     a0,    t6
5054    addi.w           a4,     a4,    -2
5055    bnez             a4,     .l_\lable\()v_8w_loop_lsx
5056    addi.d           a1,     t0,    8
5057    addi.d           t0,     t0,    8
5058    addi.d           a0,     t8,    16
5059    addi.d           t8,     t8,    16
5060    addi.d           a4,     t5,    0
5061    addi.d           a3,     a3,    -8
5062    bnez             a3,     .l_\lable\()v_8w_loop0_lsx
5063.l_\lable\()end_pre_8tap_lsx:
5064.endm
5065
5066function prep_8tap_regular_8bpc_lsx
5067    addi.w a7, zero, 0
5068    PREP_8TAP_8BPC_LSX 0
5069endfunc
5070
5071function prep_8tap_smooth_regular_8bpc_lsx
5072    addi.w a7, zero, 1
5073    PREP_8TAP_8BPC_LSX 1
5074endfunc
5075
5076function prep_8tap_sharp_regular_8bpc_lsx
5077    addi.w a7, zero, 2
5078    PREP_8TAP_8BPC_LSX 2
5079endfunc
5080
5081function prep_8tap_regular_smooth_8bpc_lsx
5082    addi.w a7, zero, 4
5083    PREP_8TAP_8BPC_LSX 4
5084endfunc
5085
5086function prep_8tap_smooth_8bpc_lsx
5087    addi.w a7, zero, 5
5088    PREP_8TAP_8BPC_LSX 5
5089endfunc
5090
5091function prep_8tap_sharp_smooth_8bpc_lsx
5092    addi.w a7, zero, 6
5093    PREP_8TAP_8BPC_LSX 6
5094endfunc
5095
5096function prep_8tap_regular_sharp_8bpc_lsx
5097    addi.w a7, zero, 8
5098    PREP_8TAP_8BPC_LSX 8
5099endfunc
5100
5101function prep_8tap_smooth_sharp_8bpc_lsx
5102    addi.w a7, zero, 9
5103    PREP_8TAP_8BPC_LSX 9
5104endfunc
5105
5106function prep_8tap_sharp_8bpc_lsx
5107    addi.w a7, zero, 10
5108    PREP_8TAP_8BPC_LSX 10
5109endfunc
5110
5111/*
5112 * static void blend_lsx(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp,
5113                         const int w, int h, const uint8_t *mask)
5114 */
5115function blend_8bpc_lsx
5116    addi.d        t8,     zero,    64
5117    vreplgr2vr.b  vr23,   t8
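    /* Per-pixel blend, as in dav1d's C blend (a sketch):
     *   dst[x] = (dst[x] * (64 - m[x]) + tmp[x] * m[x] + 32) >> 6;
     * vr23 holds the broadcast 64 used to form the inverse mask, and
     * vssrarni.bu.h ..., 6 provides the +32 rounding, the shift and
     * the unsigned saturation. */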
5118
5119    clz.w         t0,     a3
5120    li.w          t1,     26
5121    sub.w         t0,     t0,      t1
5122    la.local      t1,     .BLEND_LSX_JRTABLE
5123    alsl.d        t0,     t0,      t1,    1
5124    ld.h          t2,     t0,      0  // 16-bit jump offsets, stored relative to the table base
5125    add.d         t1,     t1,      t2 // Get absolute address
5126    jirl          $r0,    t1,      0
5127
5128    .align   3
5129.BLEND_LSX_JRTABLE:
5130    .hword .BLEND_W32_LSX  - .BLEND_LSX_JRTABLE
5131    .hword .BLEND_W16_LSX  - .BLEND_LSX_JRTABLE
5132    .hword .BLEND_W8_LSX   - .BLEND_LSX_JRTABLE
5133    .hword .BLEND_W4_LSX   - .BLEND_LSX_JRTABLE
5134
5135.BLEND_W4_LSX:
5136    vld             vr0,    a0,      0
5137    vld             vr1,    a2,      0
5138    vld             vr2,    a5,      0
5139
5140    vsllwil.hu.bu   vr1,    vr1,     0
5141    vsllwil.hu.bu   vr4,    vr2,     0
5142    vmul.h          vr1,    vr1,     vr4  //b*m
5143    vsub.b          vr3,    vr23,    vr2
5144    vsllwil.hu.bu   vr0,    vr0,     0
5145    vsllwil.hu.bu   vr3,    vr3,     0
5146    vmadd.h         vr1,    vr0,     vr3
5147    vssrarni.bu.h   vr1,    vr1,     6
5148
5149    vstelm.w        vr1,    a0,      0,   0
5150    addi.w          a4,     a4,      -1
5151    add.d           a0,     a0,      a1
5152    addi.d          a2,     a2,      4
5153    addi.d          a5,     a5,      4
5154
5155    blt             zero,   a4,     .BLEND_W4_LSX
5156    b               .BLEND_END_LSX
5157.BLEND_W8_LSX:
5158    vld             vr0,    a0,      0
5159    vld             vr1,    a2,      0
5160    vld             vr2,    a5,      0
5161
5162    vsllwil.hu.bu   vr1,    vr1,     0
5163    vsllwil.hu.bu   vr4,    vr2,     0
5164    vmul.h          vr1,    vr1,     vr4  //b*m
5165    vsub.b          vr3,    vr23,    vr2
5166    vsllwil.hu.bu   vr0,    vr0,     0
5167    vsllwil.hu.bu   vr3,    vr3,     0
5168    vmadd.h         vr1,    vr0,     vr3
5169    vssrarni.bu.h   vr1,    vr1,     6
5170
5171    vstelm.d        vr1,    a0,      0,   0
5172    addi.w          a4,     a4,      -1
5173    add.d           a0,     a0,      a1
5174    addi.d          a2,     a2,      8
5175    addi.d          a5,     a5,      8
5176
5177    blt             zero,   a4,     .BLEND_W8_LSX
5178    b               .BLEND_END_LSX
5179.BLEND_W16_LSX:
5180    vld             vr0,    a0,      0
5181    vld             vr1,    a2,      0
5182    vld             vr2,    a5,      0
5183
5184    vexth.hu.bu     vr5,    vr1
5185    vsllwil.hu.bu   vr1,    vr1,     0
5186    vexth.hu.bu     vr6,    vr2
5187    vsllwil.hu.bu   vr4,    vr2,     0
5188    vmul.h          vr1,    vr1,     vr4  //b*m
5189    vmul.h          vr5,    vr5,     vr6  //b*m
5190    vsub.b          vr3,    vr23,    vr2
5191    vexth.hu.bu     vr7,    vr0
5192    vexth.hu.bu     vr8,    vr3
5193    vmadd.h         vr5,    vr7,     vr8
5194    vsllwil.hu.bu   vr0,    vr0,     0
5195    vsllwil.hu.bu   vr3,    vr3,     0
5196    vmadd.h         vr1,    vr0,     vr3
5197    vssrarni.bu.h   vr5,    vr1,     6
5198
5199    vst             vr5,    a0,      0
5200    addi.w          a4,     a4,      -1
5201    add.d           a0,     a0,      a1
5202    addi.d          a2,     a2,      16
5203    addi.d          a5,     a5,      16
5204
5205    blt             zero,   a4,     .BLEND_W16_LSX
5206    b               .BLEND_END_LSX
5207.BLEND_W32_LSX:
5208    vld             vr0,    a0,      0
5209    vld             vr1,    a2,      0
5210    vld             vr2,    a5,      0
5211
5212    vexth.hu.bu     vr5,    vr1
5213    vsllwil.hu.bu   vr1,    vr1,     0
5214    vexth.hu.bu     vr6,    vr2
5215    vsllwil.hu.bu   vr4,    vr2,     0
5216    vmul.h          vr1,    vr1,     vr4  //b*m
5217    vmul.h          vr5,    vr5,     vr6  //b*m
5218    vsub.b          vr3,    vr23,    vr2
5219    vexth.hu.bu     vr7,    vr0
5220    vexth.hu.bu     vr8,    vr3
5221    vmadd.h         vr5,    vr7,     vr8
5222    vsllwil.hu.bu   vr0,    vr0,     0
5223    vsllwil.hu.bu   vr3,    vr3,     0
5224    vmadd.h         vr1,    vr0,     vr3
5225    vssrarni.bu.h   vr5,    vr1,     6
5226
5227    vst             vr5,    a0,      0
5228
5229    /* second */
5230    vld             vr0,    a0,      16
5231    vld             vr1,    a2,      16
5232    vld             vr2,    a5,      16
5233
5234    vexth.hu.bu     vr5,    vr1
5235    vsllwil.hu.bu   vr1,    vr1,     0
5236    vexth.hu.bu     vr6,    vr2
5237    vsllwil.hu.bu   vr4,    vr2,     0
5238    vmul.h          vr1,    vr1,     vr4  //b*m
5239    vmul.h          vr5,    vr5,     vr6  //b*m
5240    vsub.b          vr3,    vr23,    vr2
5241    vexth.hu.bu     vr7,    vr0
5242    vexth.hu.bu     vr8,    vr3
5243    vmadd.h         vr5,    vr7,     vr8
5244    vsllwil.hu.bu   vr0,    vr0,     0
5245    vsllwil.hu.bu   vr3,    vr3,     0
5246    vmadd.h         vr1,    vr0,     vr3
5247    vssrarni.bu.h   vr5,    vr1,     6
5248
5249    vst             vr5,    a0,      16
5250    addi.w          a4,     a4,      -1
5251    add.d           a0,     a0,      a1
5252    addi.d          a2,     a2,      32
5253    addi.d          a5,     a5,      32
5254
5255    blt             zero,   a4,     .BLEND_W32_LSX
5256.BLEND_END_LSX:
5257
5258endfunc
5259
5260const obmc_masks_la
5261/* Unused */
5262.byte 0,  0,  0,  0
5263/* 2 */
5264.byte 45, 19, 64, 0
5265/* 4 */
5266.byte 39, 25, 50, 14, 59,  5, 64,  0
5267/* 8 */
5268.byte 36, 28, 42, 22, 48, 16, 53, 11, 57,  7, 61,  3, 64,  0, 64,  0
5269/* 16 */
5270.byte 34, 30, 37, 27, 40, 24, 43, 21, 46, 18, 49, 15, 52, 12, 54, 10
5271.byte 56,  8, 58,  6, 60,  4, 61,  3, 64,  0, 64,  0, 64,  0, 64,  0
5272/* 32 */
5273.byte 33, 31, 35, 29, 36, 28, 38, 26, 40, 24, 41, 23, 43, 21, 44, 20
5274.byte 45, 19, 47, 17, 48, 16, 50, 14, 51, 13, 52, 12, 53, 11, 55,  9
5275.byte 56,  8, 57,  7, 58,  6, 59,  5, 60,  4, 60,  4, 61,  3, 62,  2
5276endconst
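/* Each byte pair above is (64 - m, m) and sums to 64, so a vdp2.h.bu on
 * interleaved (dst, tmp) bytes yields dst*(64-m) + tmp*m in one step. */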
5277
5278/*
5279 * static void blend_v_lsx(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp,
5280                           const int w, int h)
5281 */
5282function blend_v_8bpc_lsx
5283    la.local      t8,     obmc_masks_la
5284
5285    clz.w         t0,     a3
5286    li.w          t1,     26
5287    sub.w         t0,     t0,      t1
5288    la.local      t1,     .BLEND_V_LSX_JRTABLE
5289    alsl.d        t0,     t0,      t1,    1
5290    ld.h          t2,     t0,      0  // 16-bit jump offsets, stored relative to the table base
5291    add.d         t1,     t1,      t2 // Get absolute address
5292    jirl          $r0,    t1,      0
5293
5294    .align   3
5295.BLEND_V_LSX_JRTABLE:
5296    .hword .BLEND_V_W32_LSX  - .BLEND_V_LSX_JRTABLE
5297    .hword .BLEND_V_W16_LSX  - .BLEND_V_LSX_JRTABLE
5298    .hword .BLEND_V_W8_LSX   - .BLEND_V_LSX_JRTABLE
5299    .hword .BLEND_V_W4_LSX   - .BLEND_V_LSX_JRTABLE
5300    .hword .BLEND_V_W2_LSX   - .BLEND_V_LSX_JRTABLE
5301    .hword .BLEND_V_W2_LSX_1 - .BLEND_V_LSX_JRTABLE  //pad entry: keeps following instructions 4-byte aligned
5302
5303.BLEND_V_W2_LSX:
5304    ld.bu           t6,     t8,      4
5305    ld.bu           t7,     t8,      5
5306
5307.BLEND_V_W2_LSX_1:
5308    ld.bu           t0,     a0,      0
5309    ld.bu           t1,     a2,      0
5310    mul.d           t0,     t0,      t6
5311    mul.d           t1,     t1,      t7
5312    addi.d          t0,     t0,      32
5313    add.d           t0,     t0,      t1
5314    srli.d          t0,     t0,      6
5315    st.b            t0,     a0,      0
5316
5317    addi.w          a4,     a4,      -1
5318    add.d           a0,     a0,      a1
5319    addi.d          a2,     a2,      2
5320    addi.d          a5,     a5,      2
5321
5322    blt             zero,   a4,     .BLEND_V_W2_LSX_1
5323    b               .BLEND_V_END_LSX
5324
5325.BLEND_V_W4_LSX:
5326    vld             vr20,   t8,      8
5327
5328.BLEND_V_W4_LSX_1:
5329    vld             vr0,    a0,      0
5330    vld             vr1,    a2,      0
5331
5332    vilvl.b         vr0,    vr1,     vr0
5333    vdp2.h.bu       vr1,    vr0,     vr20
5334    vssrarni.bu.h   vr1,    vr1,     6
5335
5336    vstelm.h        vr1,    a0,      0,   0
5337    vstelm.b        vr1,    a0,      2,   2
5338    addi.w          a4,     a4,      -1
5339    add.d           a0,     a0,      a1
5340    addi.d          a2,     a2,      4
5341
5342    blt             zero,   a4,     .BLEND_V_W4_LSX_1
5343    b               .BLEND_V_END_LSX
5344
5345.BLEND_V_W8_LSX:
5346    vld             vr20,   t8,      16
5347
5348.BLEND_V_W8_LSX_1:
5349    vld             vr0,    a0,      0
5350    vld             vr1,    a2,      0
5351
5352    vilvl.b         vr0,    vr1,     vr0
5353    vdp2.h.bu       vr1,    vr0,     vr20
5354    vssrarni.bu.h   vr1,    vr1,     6
5355
5356    vstelm.w        vr1,    a0,      0,   0
5357    vstelm.h        vr1,    a0,      4,   2
5358    addi.w          a4,     a4,      -1
5359    add.d           a0,     a0,      a1
5360    addi.d          a2,     a2,      8
5361
5362    blt             zero,   a4,     .BLEND_V_W8_LSX_1
5363    b               .BLEND_V_END_LSX
5364
5365.BLEND_V_W16_LSX:
5366    vld             vr20,   t8,      32
5367    vld             vr21,   t8,      48
5368
5369.BLEND_V_W16_LSX_1:
5370    vld             vr0,    a0,      0
5371    vld             vr1,    a2,      0
5372
5373    vilvl.b         vr2,    vr1,     vr0
5374    vilvh.b         vr3,    vr1,     vr0
5375    vmulwev.h.bu    vr4,    vr2,     vr20
5376    vmulwev.h.bu    vr5,    vr3,     vr21
5377    vmaddwod.h.bu   vr4,    vr2,     vr20
5378    vmaddwod.h.bu   vr5,    vr3,     vr21
5379    vssrarni.bu.h   vr5,    vr4,     6
5380
5381    vstelm.d        vr5,    a0,      0,   0
5382    vstelm.w        vr5,    a0,      8,   2
5383    addi.w          a4,     a4,      -1
5384    add.d           a0,     a0,      a1
5385    addi.d          a2,     a2,      16
5386
5387    blt             zero,   a4,     .BLEND_V_W16_LSX_1
5388    b               .BLEND_V_END_LSX
5389
5390.BLEND_V_W32_LSX:
5391    vld             vr20,   t8,      64
5392    vld             vr21,   t8,      80
5393    vld             vr22,   t8,      96
5394
5395.BLEND_V_W32_LSX_1:
5396    vld             vr0,    a0,      0
5397    vld             vr1,    a0,      16
5398    vld             vr2,    a2,      0
5399    vld             vr3,    a2,      16
5400
5401    vilvl.b         vr4,    vr2,     vr0
5402    vmulwev.h.bu    vr7,    vr4,     vr20
5403    vilvh.b         vr5,    vr2,     vr0
5404    vmulwev.h.bu    vr8,    vr5,     vr21
5405    vilvl.b         vr6,    vr3,     vr1
5406    vmulwev.h.bu    vr9,    vr6,     vr22
5407    vmaddwod.h.bu   vr7,    vr4,     vr20
5408    vmaddwod.h.bu   vr8,    vr5,     vr21
5409    vmaddwod.h.bu   vr9,    vr6,     vr22
5410    vssrarni.bu.h   vr8,    vr7,     6
5411    vssrarni.bu.h   vr9,    vr9,     6
5412
5413    vst             vr8,    a0,      0
5414    vstelm.d        vr9,    a0,      16,   0
5415    addi.w          a4,     a4,      -1
5416    add.d           a0,     a0,      a1
5417    addi.d          a2,     a2,      32
5418
5419    blt             zero,   a4,     .BLEND_V_W32_LSX_1
5420
5421.BLEND_V_END_LSX:
5422
5423endfunc
5424
5425/*
5426 * static void blend_h_lsx(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp,
5427                           const int w, int h)
5428 */
5429function blend_h_8bpc_lsx
5430    la.local      t8,     obmc_masks_la
5431    alsl.d        t8,     a4,      t8,    1
5432    srli.d        t0,     a4,      1
5433    srli.d        t1,     a4,      2
5434    add.d         a4,     t0,      t1  // h = (h * 3) >> 2;
5435    slli.d        a4,     a4,      1
5436    add.d         a4,     a4,      t8
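    /* Only the top (h*3)>>2 rows are blended; the remaining rows' mask
     * pairs are (64, 0), which leave dst unchanged.  t8 starts at this
     * h's (64-m, m) pairs (obmc_masks_la + 2*h) and a4 becomes the end
     * pointer t8 + 2*((h*3)>>2). */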
5437
5438    clz.w         t0,     a3
5439    li.w          t1,     24
5440    sub.w         t0,     t0,      t1
5441    la.local      t1,     .BLEND_H_LSX_JRTABLE
5442    alsl.d        t0,     t0,      t1,    1
5443    ld.h          t2,     t0,      0  // 16-bit jump offsets, stored relative to the table base
5444    add.d         t1,     t1,      t2 // Get absolute address
5445    jirl          $r0,    t1,      0
5446
5447    .align   3
5448.BLEND_H_LSX_JRTABLE:
5449    .hword .BLEND_H_W128_LSX - .BLEND_H_LSX_JRTABLE
5450    .hword .BLEND_H_W64_LSX  - .BLEND_H_LSX_JRTABLE
5451    .hword .BLEND_H_W32_LSX  - .BLEND_H_LSX_JRTABLE
5452    .hword .BLEND_H_W16_LSX  - .BLEND_H_LSX_JRTABLE
5453    .hword .BLEND_H_W8_LSX   - .BLEND_H_LSX_JRTABLE
5454    .hword .BLEND_H_W4_LSX   - .BLEND_H_LSX_JRTABLE
5455    .hword .BLEND_H_W2_LSX   - .BLEND_H_LSX_JRTABLE
5456    .hword .BLEND_H_END_LSX  - .BLEND_H_LSX_JRTABLE  //pad entry: keeps following instructions 4-byte aligned
5457
5458.BLEND_H_W2_LSX:
5459    vldrepl.h       vr20,   t8,      0
5460    vld             vr0,    a0,      0
5461    vld             vr1,    a2,      0
5462
5463    vilvl.b         vr0,    vr1,     vr0
5464    vdp2.h.bu       vr1,    vr0,     vr20
5465    vssrarni.bu.h   vr1,    vr1,     6
5466
5467    vstelm.h        vr1,    a0,      0,   0
5468    addi.d          t8,     t8,      2
5469    add.d           a0,     a0,      a1
5470    addi.d          a2,     a2,      2
5471
5472    blt             t8,     a4,     .BLEND_H_W2_LSX
5473    b               .BLEND_H_END_LSX
5474
5475.BLEND_H_W4_LSX:
5476    vldrepl.h       vr20,   t8,      0
5477    vld             vr0,    a0,      0
5478    vld             vr1,    a2,      0
5479
5480    vilvl.b         vr0,    vr1,     vr0
5481    vdp2.h.bu       vr1,    vr0,     vr20
5482    vssrarni.bu.h   vr1,    vr1,     6
5483
5484    vstelm.w        vr1,    a0,      0,   0
5485    addi.d          t8,     t8,      2
5486    add.d           a0,     a0,      a1
5487    addi.d          a2,     a2,      4
5488
5489    blt             t8,     a4,     .BLEND_H_W4_LSX
5490    b               .BLEND_H_END_LSX
5491
5492.BLEND_H_W8_LSX:
5493    vldrepl.h       vr20,   t8,      0
5494    vld             vr0,    a0,      0
5495    vld             vr1,    a2,      0
5496
5497    vilvl.b         vr0,    vr1,     vr0
5498    vdp2.h.bu       vr1,    vr0,     vr20
5499    vssrarni.bu.h   vr1,    vr1,     6
5500
5501    vstelm.d        vr1,    a0,      0,   0
5502    addi.d          t8,     t8,      2
5503    add.d           a0,     a0,      a1
5504    addi.d          a2,     a2,      8
5505
5506    blt             t8,     a4,     .BLEND_H_W8_LSX
5507    b               .BLEND_H_END_LSX
5508
5509.BLEND_H_W16_LSX:
5510    vldrepl.h       vr20,   t8,      0
5511    vld             vr0,    a0,      0
5512    vld             vr1,    a2,      0
5513
5514    vilvl.b         vr2,    vr1,     vr0
5515    vilvh.b         vr3,    vr1,     vr0
5516    vmulwev.h.bu    vr4,    vr2,     vr20
5517    vmulwev.h.bu    vr5,    vr3,     vr20
5518    vmaddwod.h.bu   vr4,    vr2,     vr20
5519    vmaddwod.h.bu   vr5,    vr3,     vr20
5520    vssrarni.bu.h   vr5,    vr4,     6
5521
5522    vst             vr5,    a0,      0
5523    addi.d          t8,     t8,      2
5524    add.d           a0,     a0,      a1
5525    addi.d          a2,     a2,      16
5526
5527    blt             t8,     a4,     .BLEND_H_W16_LSX
5528    b               .BLEND_H_END_LSX
5529
5530.BLEND_H_W32_LSX:
5531    vldrepl.h       vr20,   t8,      0
5532
5533    vld             vr0,    a0,      0
5534    vld             vr1,    a0,      16
5535    vld             vr2,    a2,      0
5536    vld             vr3,    a2,      16
5537
5538    vilvl.b         vr4,    vr2,     vr0
5539    vilvh.b         vr5,    vr2,     vr0
5540    vilvl.b         vr6,    vr3,     vr1
5541    vilvh.b         vr3,    vr3,     vr1
5542    vmulwev.h.bu    vr7,    vr4,     vr20
5543    vmulwev.h.bu    vr8,    vr5,     vr20
5544    vmulwev.h.bu    vr9,    vr6,     vr20
5545    vmulwev.h.bu    vr0,    vr3,     vr20
5546    vmaddwod.h.bu   vr7,    vr4,     vr20
5547    vmaddwod.h.bu   vr8,    vr5,     vr20
5548    vmaddwod.h.bu   vr9,    vr6,     vr20
5549    vmaddwod.h.bu   vr0,    vr3,     vr20
5550    vssrarni.bu.h   vr8,    vr7,     6
5551    vssrarni.bu.h   vr0,    vr9,     6
5552
5553    vst             vr8,    a0,      0
5554    vst             vr0,    a0,      16
5555    addi.d          t8,     t8,      2
5556    add.d           a0,     a0,      a1
5557    addi.d          a2,     a2,      32
5558
5559    blt             t8,     a4,     .BLEND_H_W32_LSX
5560    b               .BLEND_H_END_LSX
5561
5562.BLEND_H_W64_LSX:
5563    vldrepl.h       vr20,   t8,      0
5564
5565    vld             vr0,    a0,      0
5566    vld             vr1,    a0,      16
5567    vld             vr2,    a0,      32
5568    vld             vr3,    a0,      48
5569    vld             vr4,    a2,      0
5570    vld             vr5,    a2,      16
5571    vld             vr6,    a2,      32
5572    vld             vr7,    a2,      48
5573
5574    vilvl.b         vr8,    vr4,     vr0
5575    vilvh.b         vr9,    vr4,     vr0
5576    vilvl.b         vr10,   vr5,     vr1
5577    vilvh.b         vr11,   vr5,     vr1
5578    vilvl.b         vr12,   vr6,     vr2
5579    vilvh.b         vr13,   vr6,     vr2
5580    vilvl.b         vr14,   vr7,     vr3
5581    vilvh.b         vr15,   vr7,     vr3
5582    vmulwev.h.bu    vr0,    vr8,     vr20
5583    vmulwev.h.bu    vr1,    vr9,     vr20
5584    vmulwev.h.bu    vr2,    vr10,    vr20
5585    vmulwev.h.bu    vr3,    vr11,    vr20
5586    vmulwev.h.bu    vr4,    vr12,    vr20
5587    vmulwev.h.bu    vr5,    vr13,    vr20
5588    vmulwev.h.bu    vr6,    vr14,    vr20
5589    vmulwev.h.bu    vr7,    vr15,    vr20
5590
5591    vmaddwod.h.bu   vr0,    vr8,     vr20
5592    vmaddwod.h.bu   vr1,    vr9,     vr20
5593    vmaddwod.h.bu   vr2,    vr10,    vr20
5594    vmaddwod.h.bu   vr3,    vr11,    vr20
5595    vmaddwod.h.bu   vr4,    vr12,    vr20
5596    vmaddwod.h.bu   vr5,    vr13,    vr20
5597    vmaddwod.h.bu   vr6,    vr14,    vr20
5598    vmaddwod.h.bu   vr7,    vr15,    vr20
5599
5600    vssrarni.bu.h   vr1,    vr0,     6
5601    vssrarni.bu.h   vr3,    vr2,     6
5602    vssrarni.bu.h   vr5,    vr4,     6
5603    vssrarni.bu.h   vr7,    vr6,     6
5604
5605    vst             vr1,    a0,      0
5606    vst             vr3,    a0,      16
5607    vst             vr5,    a0,      32
5608    vst             vr7,    a0,      48
5609    addi.d          t8,     t8,      2
5610    add.d           a0,     a0,      a1
5611    addi.d          a2,     a2,      64
5612
5613    blt             t8,     a4,     .BLEND_H_W64_LSX
5614    b               .BLEND_H_END_LSX
5615
5616.BLEND_H_W128_LSX:
5617    vldrepl.h       vr20,   t8,      0
5618
5619    vld             vr0,    a0,      0
5620    vld             vr1,    a0,      16
5621    vld             vr2,    a0,      32
5622    vld             vr3,    a0,      48
5623    vld             vr4,    a2,      0
5624    vld             vr5,    a2,      16
5625    vld             vr6,    a2,      32
5626    vld             vr7,    a2,      48
5627
5628    vilvl.b         vr8,    vr4,     vr0
5629    vilvh.b         vr9,    vr4,     vr0
5630    vilvl.b         vr10,   vr5,     vr1
5631    vilvh.b         vr11,   vr5,     vr1
5632    vilvl.b         vr12,   vr6,     vr2
5633    vilvh.b         vr13,   vr6,     vr2
5634    vilvl.b         vr14,   vr7,     vr3
5635    vilvh.b         vr15,   vr7,     vr3
5636    vmulwev.h.bu    vr0,    vr8,     vr20
5637    vmulwev.h.bu    vr1,    vr9,     vr20
5638    vmulwev.h.bu    vr2,    vr10,    vr20
5639    vmulwev.h.bu    vr3,    vr11,    vr20
5640    vmulwev.h.bu    vr4,    vr12,    vr20
5641    vmulwev.h.bu    vr5,    vr13,    vr20
5642    vmulwev.h.bu    vr6,    vr14,    vr20
5643    vmulwev.h.bu    vr7,    vr15,    vr20
5644
5645    vmaddwod.h.bu   vr0,    vr8,     vr20
5646    vmaddwod.h.bu   vr1,    vr9,     vr20
5647    vmaddwod.h.bu   vr2,    vr10,    vr20
5648    vmaddwod.h.bu   vr3,    vr11,    vr20
5649    vmaddwod.h.bu   vr4,    vr12,    vr20
5650    vmaddwod.h.bu   vr5,    vr13,    vr20
5651    vmaddwod.h.bu   vr6,    vr14,    vr20
5652    vmaddwod.h.bu   vr7,    vr15,    vr20
5653
5654    vssrarni.bu.h   vr1,    vr0,     6
5655    vssrarni.bu.h   vr3,    vr2,     6
5656    vssrarni.bu.h   vr5,    vr4,     6
5657    vssrarni.bu.h   vr7,    vr6,     6
5658
5659    vst             vr1,    a0,      0
5660    vst             vr3,    a0,      16
5661    vst             vr5,    a0,      32
5662    vst             vr7,    a0,      48
5663
5664    /* second 64-byte half */
5665    vld             vr0,    a0,      64
5666    vld             vr1,    a0,      80
5667    vld             vr2,    a0,      96
5668    vld             vr3,    a0,      112
5669    vld             vr4,    a2,      64
5670    vld             vr5,    a2,      80
5671    vld             vr6,    a2,      96
5672    vld             vr7,    a2,      112
5673
5674    vilvl.b         vr8,    vr4,     vr0
5675    vilvh.b         vr9,    vr4,     vr0
5676    vilvl.b         vr10,   vr5,     vr1
5677    vilvh.b         vr11,   vr5,     vr1
5678    vilvl.b         vr12,   vr6,     vr2
5679    vilvh.b         vr13,   vr6,     vr2
5680    vilvl.b         vr14,   vr7,     vr3
5681    vilvh.b         vr15,   vr7,     vr3
5682    vmulwev.h.bu    vr0,    vr8,     vr20
5683    vmulwev.h.bu    vr1,    vr9,     vr20
5684    vmulwev.h.bu    vr2,    vr10,    vr20
5685    vmulwev.h.bu    vr3,    vr11,    vr20
5686    vmulwev.h.bu    vr4,    vr12,    vr20
5687    vmulwev.h.bu    vr5,    vr13,    vr20
5688    vmulwev.h.bu    vr6,    vr14,    vr20
5689    vmulwev.h.bu    vr7,    vr15,    vr20
5690
5691    vmaddwod.h.bu   vr0,    vr8,     vr20
5692    vmaddwod.h.bu   vr1,    vr9,     vr20
5693    vmaddwod.h.bu   vr2,    vr10,    vr20
5694    vmaddwod.h.bu   vr3,    vr11,    vr20
5695    vmaddwod.h.bu   vr4,    vr12,    vr20
5696    vmaddwod.h.bu   vr5,    vr13,    vr20
5697    vmaddwod.h.bu   vr6,    vr14,    vr20
5698    vmaddwod.h.bu   vr7,    vr15,    vr20
5699
5700    vssrarni.bu.h   vr1,    vr0,     6
5701    vssrarni.bu.h   vr3,    vr2,     6
5702    vssrarni.bu.h   vr5,    vr4,     6
5703    vssrarni.bu.h   vr7,    vr6,     6
5704
5705    vst             vr1,    a0,      64
5706    vst             vr3,    a0,      80
5707    vst             vr5,    a0,      96
5708    vst             vr7,    a0,      112
5709
5710    addi.d          t8,     t8,      2
5711    add.d           a0,     a0,      a1
5712    addi.d          a2,     a2,      128
5713
5714    blt             t8,     a4,     .BLEND_H_W128_LSX
5715    b               .BLEND_H_END_LSX
5716
5717.BLEND_H_END_LSX:
5718
5719endfunc
5720
5721/*
5722 * static void blend_h_c(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp,
5723 *                       const int w, int h)
5724 */
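/*
 * Illustrative C sketch (hypothetical helper, not assembled): the scalar
 * blend both blend_h kernels vectorize for 8 bpc.  It assumes
 * obmc_masks_la stores one (64 - m, m) byte pair per mask index, which is
 * what the interleave/dot-product pairing below implies: the even byte
 * weights dst, the odd byte weights tmp.  The "+ 32 >> 6" matches the
 * rounding saturating narrow (vssrarni.bu.h ..., 6).
 *
 *     #include <stddef.h>
 *     #include <stdint.h>
 *
 *     extern const uint8_t obmc_masks_la[];
 *
 *     static void blend_h_ref(uint8_t *dst, const ptrdiff_t dst_stride,
 *                             const uint8_t *tmp, const int w, int h)
 *     {
 *         const uint8_t *mask = obmc_masks_la + 2 * h;
 *         for (int rows = (h * 3) >> 2; rows > 0; rows--, mask += 2) {
 *             for (int x = 0; x < w; x++)
 *                 dst[x] = (dst[x] * mask[0] + tmp[x] * mask[1] + 32) >> 6;
 *             dst += dst_stride;
 *             tmp += w;
 *         }
 *     }
 */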
5725function blend_h_8bpc_lasx
5726    la.local      t8,     obmc_masks_la
5727    alsl.d        t8,     a4,      t8,    1
5728    srli.d        t0,     a4,      1
5729    srli.d        t1,     a4,      2
5730    add.d         a4,     t0,      t1  // h = (h * 3) >> 2;
5731    slli.d        a4,     a4,      1
5732    add.d         a4,     a4,      t8
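    // a4 now points one past the last mask pair to blend; every width
    // loop below steps the mask pointer t8 by 2 until it reaches a4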
5733
5734    clz.w         t0,     a3
5735    li.w          t1,     24
5736    sub.w         t0,     t0,      t1
5737    la.local      t1,     .BLEND_H_LASX_JRTABLE
5738    alsl.d        t0,     t0,      t1,    1
5739    ld.h          t2,     t0,      0  // The jump addresses are relative to JRTABLE
5740    add.d         t1,     t1,      t2 // Get absolute address
5741    jirl          $r0,    t1,      0
5742
5743    .align   3
5744.BLEND_H_LASX_JRTABLE:
5745    .hword .BLEND_H_W128_LASX - .BLEND_H_LASX_JRTABLE
5746    .hword .BLEND_H_W64_LASX  - .BLEND_H_LASX_JRTABLE
5747    .hword .BLEND_H_W32_LASX  - .BLEND_H_LASX_JRTABLE
5748    .hword .BLEND_H_W16_LASX  - .BLEND_H_LASX_JRTABLE
5749    .hword .BLEND_H_W8_LASX   - .BLEND_H_LASX_JRTABLE
5750    .hword .BLEND_H_W4_LASX   - .BLEND_H_LASX_JRTABLE
5751    .hword .BLEND_H_W2_LASX   - .BLEND_H_LASX_JRTABLE
5752    .hword .BLEND_H_END_LASX  - .BLEND_H_LASX_JRTABLE  // Instructions must be 4-byte aligned
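/*
 * Dispatch arithmetic: for power-of-two w in [2, 128] the index is
 * clz(w) - 24, so w = 128 -> entry 0 (W128), w = 32 -> entry 2 (W32),
 * w = 2 -> entry 6 (W2); the trailing END entry would only be reached
 * for w < 2.
 */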
5753
5754.BLEND_H_W2_LASX:
5755    vldrepl.h       vr20,   t8,      0
5756    vld             vr0,    a0,      0
5757    vld             vr1,    a2,      0
5758
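    // interleave dst/tmp to {d0,t0,d1,t1,...}; vdp2.h.bu then yields
    // d*(64-m) + t*m per pixel as a 16-bit halfword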
5759    vilvl.b         vr0,    vr1,     vr0
5760    vdp2.h.bu       vr1,    vr0,     vr20
5761    vssrarni.bu.h   vr1,    vr1,     6
5762
5763    vstelm.h        vr1,    a0,      0,   0
5764    addi.d          t8,     t8,      2
5765    add.d           a0,     a0,      a1
5766    addi.d          a2,     a2,      2
5767
5768    blt             t8,     a4,     .BLEND_H_W2_LASX
5769    b               .BLEND_H_END_LASX
5770
5771.BLEND_H_W4_LASX:
5772    vldrepl.h       vr20,   t8,      0
5773    vld             vr0,    a0,      0
5774    vld             vr1,    a2,      0
5775
5776    vilvl.b         vr0,    vr1,     vr0
5777    vdp2.h.bu       vr1,    vr0,     vr20
5778    vssrarni.bu.h   vr1,    vr1,     6
5779
5780    vstelm.w        vr1,    a0,      0,   0
5781    addi.d          t8,     t8,      2
5782    add.d           a0,     a0,      a1
5783    addi.d          a2,     a2,      4
5784
5785    blt             t8,     a4,     .BLEND_H_W4_LASX
5786    b               .BLEND_H_END_LASX
5787
5788.BLEND_H_W8_LASX:
5789    vldrepl.h       vr20,   t8,      0
5790    vld             vr0,    a0,      0
5791    vld             vr1,    a2,      0
5792
5793    vilvl.b         vr0,    vr1,     vr0
5794    vdp2.h.bu       vr1,    vr0,     vr20
5795    vssrarni.bu.h   vr1,    vr1,     6
5796
5797    vstelm.d        vr1,    a0,      0,   0
5798    addi.d          t8,     t8,      2
5799    add.d           a0,     a0,      a1
5800    addi.d          a2,     a2,      8
5801
5802    blt             t8,     a4,     .BLEND_H_W8_LASX
5803    b               .BLEND_H_END_LASX
5804
5805.BLEND_H_W16_LASX:
5806    vldrepl.h       vr20,   t8,      0
5807    vld             vr0,    a0,      0
5808    vld             vr1,    a2,      0
5809
5810    vilvl.b         vr2,    vr1,     vr0
5811    vilvh.b         vr3,    vr1,     vr0
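    // even/odd widening multiply-accumulate == d*(64-m) + t*m per pixel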
5812    vmulwev.h.bu    vr4,    vr2,     vr20
5813    vmulwev.h.bu    vr5,    vr3,     vr20
5814    vmaddwod.h.bu   vr4,    vr2,     vr20
5815    vmaddwod.h.bu   vr5,    vr3,     vr20
5816    vssrarni.bu.h   vr5,    vr4,     6
5817
5818    vst             vr5,    a0,      0
5819    addi.d          t8,     t8,      2
5820    add.d           a0,     a0,      a1
5821    addi.d          a2,     a2,      16
5822
5823    blt             t8,     a4,     .BLEND_H_W16_LASX
5824    b               .BLEND_H_END_LASX
5825
5826.BLEND_H_W32_LASX:
5827    xvldrepl.h      xr20,   t8,      0
5828
5829    xvld            xr0,    a0,      0
5830    xvld            xr1,    a2,      0
5831
5832    xvilvl.b        xr2,    xr1,     xr0
5833    xvilvh.b        xr3,    xr1,     xr0
5834
5835    xvmulwev.h.bu   xr4,    xr2,     xr20
5836    xvmulwev.h.bu   xr5,    xr3,     xr20
5837    xvmaddwod.h.bu  xr4,    xr2,     xr20
5838    xvmaddwod.h.bu  xr5,    xr3,     xr20
5839    xvssrarni.bu.h  xr5,    xr4,     6
5840
5841    xvst            xr5,    a0,      0
5842    addi.d          t8,     t8,      2
5843    add.d           a0,     a0,      a1
5844    addi.d          a2,     a2,      32
5845
5846    blt             t8,     a4,     .BLEND_H_W32_LASX
5847    b               .BLEND_H_END_LASX
5848
5849.BLEND_H_W64_LASX:
5850    xvldrepl.h      xr20,   t8,      0
5851
5852    xvld            xr0,    a0,      0
5853    xvld            xr1,    a0,      32
5854    xvld            xr2,    a2,      0
5855    xvld            xr3,    a2,      32
5856
5857    xvilvl.b        xr4,    xr2,     xr0
5858    xvilvh.b        xr5,    xr2,     xr0
5859    xvilvl.b        xr6,    xr3,     xr1
5860    xvilvh.b        xr7,    xr3,     xr1
5861
5862    xvmulwev.h.bu   xr0,    xr4,     xr20
5863    xvmulwev.h.bu   xr1,    xr5,     xr20
5864    xvmulwev.h.bu   xr2,    xr6,     xr20
5865    xvmulwev.h.bu   xr3,    xr7,     xr20
5866    xvmaddwod.h.bu  xr0,    xr4,     xr20
5867    xvmaddwod.h.bu  xr1,    xr5,     xr20
5868    xvmaddwod.h.bu  xr2,    xr6,     xr20
5869    xvmaddwod.h.bu  xr3,    xr7,     xr20
5870    xvssrarni.bu.h  xr1,    xr0,     6
5871    xvssrarni.bu.h  xr3,    xr2,     6
5872
5873    xvst            xr1,    a0,      0
5874    xvst            xr3,    a0,      32
5875    addi.d          t8,     t8,      2
5876    add.d           a0,     a0,      a1
5877    addi.d          a2,     a2,      64
5878
5879    blt             t8,     a4,     .BLEND_H_W64_LASX
5880    b               .BLEND_H_END_LASX
5881
5882.BLEND_H_W128_LASX:
5883    xvldrepl.h      xr20,   t8,      0
5884
5885    xvld            xr0,    a0,      0
5886    xvld            xr1,    a0,      32
5887    xvld            xr2,    a0,      64
5888    xvld            xr3,    a0,      96
5889    xvld            xr4,    a2,      0
5890    xvld            xr5,    a2,      32
5891    xvld            xr6,    a2,      64
5892    xvld            xr7,    a2,      96
5893
5894    xvilvl.b        xr8,    xr4,     xr0
5895    xvilvh.b        xr9,    xr4,     xr0
5896    xvilvl.b        xr10,   xr5,     xr1
5897    xvilvh.b        xr11,   xr5,     xr1
5898    xvilvl.b        xr12,   xr6,     xr2
5899    xvilvh.b        xr13,   xr6,     xr2
5900    xvilvl.b        xr14,   xr7,     xr3
5901    xvilvh.b        xr15,   xr7,     xr3
5902
5903    xvmulwev.h.bu   xr0,    xr8,     xr20
5904    xvmulwev.h.bu   xr1,    xr9,     xr20
5905    xvmulwev.h.bu   xr2,    xr10,    xr20
5906    xvmulwev.h.bu   xr3,    xr11,    xr20
5907    xvmulwev.h.bu   xr4,    xr12,    xr20
5908    xvmulwev.h.bu   xr5,    xr13,    xr20
5909    xvmulwev.h.bu   xr6,    xr14,    xr20
5910    xvmulwev.h.bu   xr7,    xr15,    xr20
5911    xvmaddwod.h.bu  xr0,    xr8,     xr20
5912    xvmaddwod.h.bu  xr1,    xr9,     xr20
5913    xvmaddwod.h.bu  xr2,    xr10,    xr20
5914    xvmaddwod.h.bu  xr3,    xr11,    xr20
5915    xvmaddwod.h.bu  xr4,    xr12,    xr20
5916    xvmaddwod.h.bu  xr5,    xr13,    xr20
5917    xvmaddwod.h.bu  xr6,    xr14,    xr20
5918    xvmaddwod.h.bu  xr7,    xr15,    xr20
5919    xvssrarni.bu.h  xr1,    xr0,     6
5920    xvssrarni.bu.h  xr3,    xr2,     6
5921    xvssrarni.bu.h  xr5,    xr4,     6
5922    xvssrarni.bu.h  xr7,    xr6,     6
5923
5924    xvst            xr1,    a0,      0
5925    xvst            xr3,    a0,      32
5926    xvst            xr5,    a0,      64
5927    xvst            xr7,    a0,      96
5928    addi.d          t8,     t8,      2
5929    add.d           a0,     a0,      a1
5930    addi.d          a2,     a2,      128
5931
5932    blt             t8,     a4,     .BLEND_H_W128_LASX
5933    b               .BLEND_H_END_LASX
5934
5935.BLEND_H_END_LASX:
5936
5937endfunc
5938
5939/*
5940 *  Tiered byte copy; a1/a2/a3 must already hold the constants 16/8/4.
5941 *  temp reg: a4
5942 */
5943.macro PIXEL_COPY_LSX _dst, _src, _size
5944    blt             \_size,  a1,     8f
594516:
5946    vld             vr0,     \_src,  0
5947    vst             vr0,     \_dst,  0
5948    addi.d          \_size,  \_size, -16
5949    addi.d          \_dst,   \_dst,  16
5950    addi.d          \_src,   \_src,  16
5951    blt             a1,      \_size, 16b
59528:
5953    blt             \_size,  a2,     14f
5954    ld.d            a4,      \_src,  0
5955    st.d            a4,      \_dst,  0
5956    addi.d          \_size,  \_size, -8
5957    addi.d          \_dst,   \_dst,  8
5958    addi.d          \_src,   \_src,  8
595914:
5960    blt             \_size,  a3,     11f
5961    ld.w            a4,      \_src,  0
5962    st.w            a4,      \_dst,  0
5963    addi.d          \_size,  \_size, -4
5964    addi.d          \_dst,   \_dst,  4
5965    addi.d          \_src,   \_src,  4
596611:
5967    beqz            \_size,  110f
5968111:
5969    ld.b            a4,      \_src,  0
5970    st.b            a4,      \_dst,  0
5971    addi.d          \_size,  \_size, -1
5972    addi.d          \_dst,   \_dst,  1
5973    addi.d          \_src,   \_src,  1
5974    bnez            \_size,  111b
5975110:
5976.endm
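/*
 * Illustrative C equivalent (hypothetical name, not part of the build).
 * The 16-byte loop's back-edge only continues while more than 16 bytes
 * remain, so a trailing 16-byte remainder drains through the 8/4/1
 * tails; the same bytes are moved either way.
 *
 *     #include <stdint.h>
 *     #include <string.h>
 *
 *     static void pixel_copy_ref(uint8_t *dst, const uint8_t *src, int64_t size)
 *     {
 *         if (size >= 16) do {            // vld/vst chunk
 *             memcpy(dst, src, 16);
 *             dst += 16; src += 16; size -= 16;
 *         } while (size > 16);
 *         if (size >= 8) { memcpy(dst, src, 8); dst += 8; src += 8; size -= 8; }
 *         if (size >= 4) { memcpy(dst, src, 4); dst += 4; src += 4; size -= 4; }
 *         while (size > 0) { *dst++ = *src++; size--; }
 *     }
 */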
5977
5978/*
5979 *  Tiered byte fill; a1/a2/a3 must already hold the constants 16/8/4.
5980 */
5981.macro PIXEL_SET_LSX _dst, _vsrc, _size
5982    blt             \_size,  a1,     8f
598316:
5984    vst             \_vsrc,  \_dst,  0
5985    addi.d          \_size,  \_size, -16
5986    addi.d          \_dst,   \_dst,  16
5987    blt             a1,      \_size, 16b
59888:
5989    blt             \_size,  a2,     14f
5990    vstelm.d        \_vsrc,  \_dst,  0,   0
5991    addi.d          \_size,  \_size, -8
5992    addi.d          \_dst,   \_dst,  8
599314:
5994    blt             \_size,  a3,     11f
5995    vstelm.w        \_vsrc,  \_dst,  0,   0
5996    addi.d          \_size,  \_size, -4
5997    addi.d          \_dst,   \_dst,  4
599811:
5999    beqz            \_size,  110f
6000111:
6001    vstelm.b        \_vsrc,  \_dst,  0,   0
6002    addi.d          \_size,  \_size, -1
6003    addi.d          \_dst,   \_dst,  1
6004    bnez            \_size,  111b
6005110:
6006.endm
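/*
 * Fill variant of the same tiering (illustrative C, reusing the headers
 * from the sketch above; the macro itself takes a vector register already
 * splatted with the byte via vldrepl.b):
 *
 *     static void pixel_set_ref(uint8_t *dst, uint8_t val, int64_t size)
 *     {
 *         if (size >= 16) do { memset(dst, val, 16); dst += 16; size -= 16; } while (size > 16);
 *         if (size >= 8)     { memset(dst, val, 8);  dst += 8;  size -= 8; }
 *         if (size >= 4)     { memset(dst, val, 4);  dst += 4;  size -= 4; }
 *         while (size > 0)   { *dst++ = val; size--; }
 *     }
 */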
6007
6008/*
6009 *  Per-row left-extend / copy / right-extend loop; temp reg: a4 a5 t2 t3 vr0
6010 */
6011.macro DEGE_LOOP need_left, need_right
60120:
6013    addi.d          t2,      t6,     0   // dst
6014    addi.d          t3,      t7,     0   // src
6015.if \need_left
6016    vldrepl.b       vr0,     t3,     0
6017    addi.d          a5,      t0,     0
6018    PIXEL_SET_LSX t2, vr0, a5
6019.endif
6020
6021    addi.d          a5,      t4,     0
6022    PIXEL_COPY_LSX t2, t3, a5
6023
6024.if \need_right
6025    vldrepl.b       vr0,     t3,     -1
6026    addi.d          a5,      t1,     0
6027    PIXEL_SET_LSX t2, vr0, a5
6028.endif
6029
6030    addi.d          t5,      t5,     -1
6031    add.d           t7,      t7,     t8
6032    add.d           t6,      t6,     a7
6033    bnez            t5,      0b
6034.endm
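/*
 * Illustrative C for one instantiation (hypothetical name; need_left and
 * need_right are assemble-time flags, so four specialized loops are
 * emitted in emu_edge below).  Register mapping per the emu_edge body
 * that follows: t6 = dst row, t7 = src row, t0 = left_ext,
 * t1 = right_ext, t4 = center_w, t5 = center_h, a7 = dst_stride,
 * t8 = ref_stride.  Headers as in the sketches above.
 *
 *     static void dege_loop_ref(uint8_t *dst, ptrdiff_t dst_stride,
 *                               const uint8_t *src, ptrdiff_t ref_stride,
 *                               int left_ext, int center_w, int right_ext,
 *                               int center_h, int need_left, int need_right)
 *     {
 *         for (; center_h > 0; center_h--, src += ref_stride, dst += dst_stride) {
 *             uint8_t *d = dst;
 *             if (need_left)  { memset(d, src[0], left_ext); d += left_ext; }
 *             memcpy(d, src, center_w); d += center_w;
 *             if (need_right) memset(d, src[center_w - 1], right_ext);
 *         }
 *     }
 */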
6035
6036/*
6037 * static void emu_edge_c(const intptr_t bw, const intptr_t bh,
6038 *                        const intptr_t iw, const intptr_t ih,
6039 *                        const intptr_t x, const intptr_t y,
6040 *                        pixel *dst, const ptrdiff_t dst_stride,
6041 *                        const pixel *ref, const ptrdiff_t ref_stride)
6042 */
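/*
 * Self-contained C sketch of the whole routine (illustrative, hypothetical
 * names): it mirrors the vclip.w extent math and copy loops below,
 * assuming the caller guarantees center_w >= 1 and center_h >= 1.
 *
 *     #include <stdint.h>
 *     #include <stddef.h>
 *     #include <string.h>
 *
 *     static int iclip(int v, int lo, int hi)
 *     { return v < lo ? lo : v > hi ? hi : v; }
 *
 *     static void emu_edge_ref(intptr_t bw, intptr_t bh, intptr_t iw,
 *                              intptr_t ih, intptr_t x, intptr_t y,
 *                              uint8_t *dst, ptrdiff_t dst_stride,
 *                              const uint8_t *ref, ptrdiff_t ref_stride)
 *     {
 *         ref += iclip((int) y, 0, (int) ih - 1) * ref_stride +
 *                iclip((int) x, 0, (int) iw - 1);
 *         const int left_ext   = iclip((int) -x,            0, (int) bw - 1);
 *         const int right_ext  = iclip((int) (x + bw - iw), 0, (int) bw - 1);
 *         const int top_ext    = iclip((int) -y,            0, (int) bh - 1);
 *         const int bottom_ext = iclip((int) (y + bh - ih), 0, (int) bh - 1);
 *         const int center_w   = (int) bw - left_ext - right_ext;
 *         const int center_h   = (int) bh - top_ext - bottom_ext;
 *
 *         uint8_t *blk = dst + top_ext * dst_stride;
 *         for (int r = 0; r < center_h; r++) {    // DEGE_LOOP body
 *             memset(blk, ref[0], left_ext);
 *             memcpy(blk + left_ext, ref, center_w);
 *             memset(blk + left_ext + center_w, ref[center_w - 1], right_ext);
 *             ref += ref_stride; blk += dst_stride;
 *         }
 *         for (int r = 0; r < bottom_ext; r++, blk += dst_stride)
 *             memcpy(blk, blk - dst_stride, bw);  // replicate last row down
 *         blk = dst + top_ext * dst_stride;
 *         for (int r = 0; r < top_ext; r++, dst += dst_stride)
 *             memcpy(dst, blk, bw);               // replicate first row up
 *     }
 */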
6043function emu_edge_8bpc_lsx
6044    vxor.v          vr23,   vr23,    vr23   // zero
6045    addi.d          t0,     a3,      -1     // ih - 1
6046    addi.d          t1,     a2,      -1     // iw - 1
6047    vreplgr2vr.w    vr22,   t0
6048    vinsgr2vr.w     vr22,   t1,        1
6049    vreplgr2vr.w    vr0,    a5
6050    vinsgr2vr.w     vr0,    a4,        1     // lane0 = y (clamped by ih-1) | lane1 = x (clamped by iw-1)
6051
6052    vclip.w         vr2,    vr0,      vr23,    vr22
6053    vpickve2gr.w    t0,     vr2,      0
6054    ld.d            t2,     sp,       0
6055    ld.d            t8,     sp,       8     // ref_stride
6056    mul.w           t0,     t0,       t8
6057    vpickve2gr.w    t1,     vr2,      1
6058    add.d           t2,     t2,       t1
6059    add.d           t7,     t0,       t2    // ref
6060
6061    addi.d          t0,     a0,       -1     // bw - 1
6062    addi.d          t1,     a1,       -1     // bh - 1
6063    vreplgr2vr.w    vr21,   t0
6064    vreplgr2vr.w    vr22,   t1
6065    vilvl.d         vr21,   vr22,      vr21
6066    sub.d           t2,     zero,      a4    // -x
6067    add.d           t3,     a0,        a4
6068    sub.d           t3,     t3,        a2    // x + bw - iw
6069    sub.d           t4,     zero,      a5    // -y
6070    add.d           t5,     a1,        a5
6071    sub.d           t5,     t5,        a3    // y + bh - ih
6072    vreplgr2vr.w    vr0,    t2
6073    vinsgr2vr.w     vr0,    t3,        1
6074    vinsgr2vr.w     vr0,    t4,        2
6075    vinsgr2vr.w     vr0,    t5,        3
6076    vclip.w         vr2,    vr0,       vr23,    vr21
6077    vpickve2gr.w    t0,     vr2,       0     // left_ext
6078    vpickve2gr.w    t1,     vr2,       1     // right_ext
6079    vpickve2gr.w    t2,     vr2,       2     // top_ext
6080    vpickve2gr.w    t3,     vr2,       3     // bottom_ext
6081
6082    mul.w           t6,     t2,        a7
6083    add.d           t4,     t0,        t1
6084    add.d           t5,     t2,        t3
6085    sub.d           t4,     a0,        t4    // center_w
6086    sub.d           t5,     a1,        t5    // center_h
6087
6088    addi.d          a1,     zero,      16
6089    addi.d          a2,     zero,      8
6090    addi.d          a3,     zero,      4
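    // a1/a2/a3 = 16/8/4: size thresholds expected by PIXEL_COPY_LSX/PIXEL_SET_LSX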
6091    add.d           t6,     t6,        a6    // blk
6092
6093    beqz            t0,     2f
6094    // need_left
6095    beqz            t1,     3f
6096    // need_left + need_right
6097    DEGE_LOOP       1,   1
6098    b               5f
6099
61002:
6101    // !need_left
6102    beqz            t1,     4f
6103    // !need_left + need_right
6104    DEGE_LOOP       0,   1
6105    b               5f
6106
61073:
6108    // need_left + !need_right
6109    DEGE_LOOP       1,   0
6110    b               5f
6111
61124:
6113    // !need_left + !need_right
6114    DEGE_LOOP       0,   0
6115
61165:
6117    vpickve2gr.w    t2,     vr2,       2     // top_ext
6118    vpickve2gr.w    t3,     vr2,       3     // bottom_ext
6119    sub.d           t7,     a7,        a0    // dst_stride - bw
6120    mul.w           t8,     t2,        a7
6121
6122    beqz            t3,     2f
6123    // need_bottom
6124    sub.d           t0,     t6,        a7    //  &dst[-PXSTRIDE(dst_stride)]
61251:
6126    addi.d          t1,     t0,        0
6127    addi.d          a5,     a0,        0
6128    PIXEL_COPY_LSX t6, t1, a5
6129    add.d           t6,     t6,        t7
6130    addi.d          t3,     t3,        -1
6131    bnez            t3,     1b
61322:
6133    beqz            t2,     3f
6134    // need_top
6135    add.d           t8,     t8,        a6    // blk
61361:
6137    addi.d          t1,     t8,        0
6138    addi.d          a5,     a0,        0
6139    PIXEL_COPY_LSX a6, t1, a5
6140    add.d           a6,     a6,        t7
6141    addi.d          t2,     t2,        -1
6142    bnez            t2,     1b
61433:
6144
6145endfunc
6146