/******************************************************************************
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2024, Nathan Egge, Niklas Haas, Bogdan Gligorijevic
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/

#include "src/riscv/asm.S"

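// blend(): dst[x] = (dst[x]*(64 - m[x]) + tmp[x]*m[x] + 32) >> 6 for 8 bpc,
// assuming dav1d's usual argument order: a0 = dst, a1 = dst_stride, a2 = tmp,
// a3 = w (also the tmp/mask row stride), a4 = h, a5 = mask.
// The _vl256 entry point appears to be an alternate for wider vectors
// (VLEN >= 256); the two entries only differ in the LMUL base folded into the
// vtype, built as (base + ctz(w)) & 0xc7 (e8, tail/mask agnostic, LMUL scaled
// to the block width), before sharing the main loop at L(blend_epilog).
// csrw vxrm, zero selects round-to-nearest-up, so vnclipu.wi with shift 6
// supplies the +32 rounding bias.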
function blend_vl256_8bpc_rvv, export=1, ext=zbb
  ctz t0, a3
  addi t0, t0, 0xc3
  j L(blend_epilog)
endfunc

function blend_8bpc_rvv, export=1, ext="v,zbb"
  ctz t0, a3
  addi t0, t0, 0xc4
L(blend_epilog):
  csrw vxrm, zero
  andi t0, t0, 0xc7
  vsetvl zero, a3, t0
  li t1, 64
1:
  addi a4, a4, -2
  vle8.v v4, (a2)
  add a2, a2, a3
  vle8.v v6, (a2)
  add a2, a2, a3
  vle8.v v8, (a5)
  add a5, a5, a3
  vle8.v v10, (a5)
  add a5, a5, a3
  vle8.v v0, (a0)
  add t0, a0, a1
  vle8.v v2, (t0)
  vwmulu.vv v16, v4, v8
  vwmulu.vv v20, v6, v10
  vrsub.vx v8, v8, t1
  vrsub.vx v10, v10, t1
  vwmaccu.vv v16, v0, v8
  vwmaccu.vv v20, v2, v10
  vnclipu.wi v0, v16, 6
  vnclipu.wi v2, v20, 6
  vse8.v v0, (a0)
  vse8.v v2, (t0)
  add a0, t0, a1
  bnez a4, 1b
  ret
endfunc

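// blend_h(): vertical OBMC blend. Each row uses a single scalar weight read
// from dav1d_obmc_masks[], starting at &dav1d_obmc_masks[h], and only roughly
// the top three quarters of the rows are blended (a4 is reduced to h - h/4
// before the loop, two rows per iteration). Assumed argument order:
// a0 = dst, a1 = dst_stride, a2 = tmp, a3 = w, a4 = h.
// Widths up to 64 run the shared loop directly; wider blocks (w == 128) take
// the 128: path below, which drives the same loop over two 64-pixel halves.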
function blend_h_vl256_8bpc_rvv, export=1, ext=zbb
  srai t0, a3, 2
  li t2, 64
  ctz t0, t0
  addi t0, t0, 0xc5
  j L(blend_h_epilog)
endfunc

function blend_h_8bpc_rvv, export=1, ext="v,zbb"
  li t2, 64
  bgt a3, t2, 128f
  ctz t0, a3
  addi t0, t0, 0xc4
L(blend_h_epilog):
  csrw vxrm, zero
  andi t0, t0, 0xc7
  vsetvl zero, a3, t0
  la t1, dav1d_obmc_masks
  srai t0, a4, 2
  add t1, t1, a4
  sub a4, a4, t0
0:
  mv t5, ra
1:
  addi a4, a4, -2
  lbu t3, (t1)
  addi t1, t1, 1
  lbu t4, (t1)
  addi t1, t1, 1
  vle8.v v8, (a2)
  add a2, a2, a3
  vle8.v v12, (a2)
  add a2, a2, a3
  vle8.v v0, (a0)
  add t0, a0, a1
  vle8.v v4, (t0)
  vwmulu.vx v16, v8, t3
  vwmulu.vx v24, v12, t4
  sub t3, t2, t3
  sub t4, t2, t4
  vwmaccu.vx v16, t3, v0
  vwmaccu.vx v24, t4, v4
  vnclipu.wi v0, v16, 6
  vnclipu.wi v4, v24, 6
  vse8.v v0, (a0)
  vse8.v v4, (t0)
  add a0, t0, a1
  bgtz a4, 1b
  jr t5
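  // w == 128 driver: split each row into two 64-pixel halves at LMUL=4 and
  // run the shared loop above once per half. The starting dst/tmp pointers
  // and row count are stashed in a5/a6/a7, the loop is entered via jal with
  // t5 as the link register, and t1 is rewound to the same dav1d_obmc_masks
  // position before jumping back to 0: for the second half (which returns to
  // the original caller through the saved ra).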
128:
  csrw vxrm, zero
  vsetvli zero, t2, e8, m4, ta, ma
  la t1, dav1d_obmc_masks
  srai t0, a4, 2
  add t1, t1, a4
  sub a4, a4, t0
  mv a5, a0
  mv a6, a2
  mv a7, a4
  jal t5, 1b
  add t1, t1, a4
  add a0, a5, t2
  add a2, a6, t2
  mv a4, a7
  sub t1, t1, a4
  j 0b
endfunc

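// blend_v(): horizontal OBMC blend. The per-column weights are a single
// vector loaded once from &dav1d_obmc_masks[w] (v8), with the complement
// 64 - m kept in v10, and applied unchanged to every row, two rows per
// iteration. Assumed argument order matches blend_h: a0 = dst,
// a1 = dst_stride, a2 = tmp, a3 = w, a4 = h.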
function blend_v_vl256_8bpc_rvv, export=1, ext=zbb
  srai t0, a3, 2
  ctz t0, t0
  addi t0, t0, 0xc5
  j L(blend_v_epilog)
endfunc

function blend_v_8bpc_rvv, export=1, ext="v,zbb"
  ctz t0, a3
  addi t0, t0, 0xc4
L(blend_v_epilog):
  andi t0, t0, 0xc7
  vsetvl zero, a3, t0
  csrw vxrm, zero
  la t1, dav1d_obmc_masks
  add t1, t1, a3
  vle8.v v8, (t1)
  li t0, 64
  vrsub.vx v10, v8, t0
1:
  addi a4, a4, -2
  vle8.v v4, (a2)
  add a2, a2, a3
  vle8.v v6, (a2)
  add a2, a2, a3
  vle8.v v0, (a0)
  add t0, a0, a1
  vle8.v v2, (t0)
  vwmulu.vv v12, v4, v8
  vwmulu.vv v16, v6, v8
  vwmaccu.vv v12, v0, v10
  vwmaccu.vv v16, v2, v10
  vnclipu.wi v0, v12, 6
  vnclipu.wi v2, v16, 6
  vse8.v v0, (a0)
  vse8.v v2, (t0)
  add a0, t0, a1
  bnez a4, 1b
  ret
endfunc

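// Per-pair combine step shared by the bidir functions below. Each macro takes
// two vectors of 16-bit intermediates (and, for mask, a vector of weights):
//   avg:   tmp1 + tmp2                  (the rounding shift is left to the
//                                        caller's final vnclipu)
//   w_avg: (tmp1*weight + tmp2*(16 - weight)) >> 8, with weight in a6 and
//          16 - weight precomputed in a7 by the function prologue below
//   mask:  (tmp1*m + tmp2*(64 - m)) >> 10, with the constant 64 kept in a7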
.macro avg va, vb, vm
    vadd.vv \va, \va, \vb
.endm

.macro w_avg va, vb, vm
    vwmul.vx v24, \va, a6
    vwmacc.vx v24, a7, \vb
    vnclip.wi \va, v24, 8
.endm

.macro mask va, vb, vm
    vwmul.vv v24, \va, \vm
    vrsub.vx \vm, \vm, a7
    vwmacc.vv v24, \vb, \vm
    vnclip.wi \va, v24, 10
.endm

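// bidir_fn expands to avg/w_avg/mask_8bpc_rvv. Assumed prototype (as in
// dav1d's mc templates): a0 = dst, a1 = dst_stride, a2 = tmp1, a3 = tmp2,
// a4 = w, a5 = h, a6 = weight (w_avg) or mask pointer (mask). \shift is the
// final vnclipu narrowing shift. Three code paths: the fall-through loop
// handles widths that fit a single vector register group (vtype derived from
// w and vlenb, four rows per iteration), 2: handles rows wider than one
// group, and 4: is the w == 4 case.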
.macro bidir_fn type, shift
function \type\()_8bpc_rvv, export=1, ext="v,zba,zbb"
.ifc \type, w_avg
    li a7, 16
    sub a7, a7, a6
.endif
.ifc \type, mask
    li a7, 64
.endif
    li t0, 4
    csrw vxrm, zero
    beq t0, a4, 4f
    csrr t0, vlenb
    ctz t1, a4
    ctz t0, t0
    li t2, 1
    sub t0, t1, t0
    li t4, -3
    bgt t0, t2, 2f
    max t0, t0, t4
    andi t1, t0, 0x7
    addi t0, t1, 1 # may overflow into E16 bit
    ori t0, t0, MA | TA | E16
    ori t1, t1, MA | TA | E8
1:
    addi a5, a5, -4
.rept 2
    vsetvl zero, a4, t0
    sh1add t3, a4, a2
    vle16.v v0, (a2)
    sh1add a2, a4, t3
    vle16.v v4, (t3)
    sh1add t3, a4, a3
    vle16.v v8, (a3)
    sh1add a3, a4, t3
    vle16.v v12, (t3)
.ifc \type, mask
    add t3, a4, a6
    vle8.v v24, (a6)
    add a6, a4, t3
    vle8.v v26, (t3)
    vzext.vf2 v16, v24
    vzext.vf2 v20, v26
.endif
    \type v0, v8, v16
    \type v4, v12, v20
    vmax.vx v8, v0, zero
    vmax.vx v12, v4, zero
    vsetvl zero, zero, t1
    vnclipu.wi v0, v8,  \shift
    vnclipu.wi v2, v12, \shift
    add t3, a1, a0
    vse8.v v0, (a0)
    add a0, a1, t3
    vse8.v v2, (t3)
.endr
    bnez a5, 1b
    ret
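    // Wide path: a row does not fit in one e16/LMUL=4 vector group, so the
    // inner 20: loop walks each row in chunks of 2*vl elements (t2 = vl,
    // t4 counts off the width), while the outer 2: loop steps one dst row
    // (and w contiguous tmp elements) at a time.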
2:
    mv t0, a0
    neg t4, a4
    add a0, a1, a0
    addi a5, a5, -1
20:
    vsetvli t2, a4, e16, m4, ta, ma
    sh1add t4, t2, t4
    sh1add t3, t2, a2
    vle16.v v0, (a2)
    sh1add a2, t2, t3
    vle16.v v4, (t3)
    sh1add t3, t2, a3
    vle16.v v8, (a3)
    sh1add a3, t2, t3
    vle16.v v12, (t3)
.ifc \type, mask
    add t3, t2, a6
    vle8.v v24, (a6)
    add a6, t2, t3
    vle8.v v26, (t3)
    vzext.vf2 v16, v24
    vzext.vf2 v20, v26
.endif
    \type v0, v8, v16
    \type v4, v12, v20
    vmax.vx v8, v0, zero
    vmax.vx v12, v4, zero
    vsetvli zero, zero, e8, m2, ta, ma
    vnclipu.wi v0, v8,  \shift
    vnclipu.wi v2, v12, \shift
    add t3, t2, t0
    vse8.v v0, (t0)
    add t0, t2, t3
    vse8.v v2, (t3)
    bnez t4, 20b
    bnez a5, 2b
    ret
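    // w == 4 path: rows are only four pixels, so several rows are packed into
    // one vector (AVL = 4*h), narrowed to 8 bits, and written out with a
    // strided 32-bit store (vsse32), one row per dst_stride step.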
4:
    slli t0, a5, 2
    vsetvli t1, t0, e16, m4, ta, ma
    vle16.v v0, (a2)
    sh1add a2, t1, a2
    vle16.v v4, (a3)
    sh1add a3, t1, a3
.ifc \type, mask
    vle8.v v16, (a6)
    add a6, t1, a6
    vzext.vf2 v8, v16
.endif
    \type v0, v4, v8
    vmax.vx v8, v0, zero
    vsetvli zero, zero, e8, m2, ta, ma
    vnclipu.wi v0, v8, \shift
    vsetvli t1, a5, e32, m2, ta, ma
    vsse32.v v0, (a0), a1
    ctz t0, t1
    sub a5, a5, t1
    sll t0, a1, t0
    add a0, t0, a0
    bnez a5, 4b
    ret
endfunc
.endm

bidir_fn avg,   5
bidir_fn w_avg, 0
bidir_fn mask,  0

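// warp_8x8: 8x8 warped motion compensation. Assumed arguments (following
// dav1d's warp8x8 prototype): a0 = dst, a1 = dst_stride, a2 = src,
// a3 = src_stride, a4 = abcd (int16_t[4] filter deltas), a5 = mx, a6 = my.
// The horizontal pass filters 15 rows of 8 pixels, gathering 8-tap
// coefficients from dav1d_mc_warp_filter with vluxseg8ei32, shifts by 3 and
// stores them to a 15x8 int16_t scratch buffer on the stack; the vertical
// pass then filters 8 output rows from that buffer, shifts by 11 and clamps
// to 8-bit pixels.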
function warp_8x8_8bpc_rvv, export=1, ext="v"
    csrw vxrm, zero

    vsetivli zero, 8, e16, m1, ta, ma
    addi sp, sp, -2*15*8
    mv t5, sp
    li t0, 3
    mul t0, a3, t0
    sub a2, a2, t0
    addi a2, a2, -3

    li t0, 64
    addi a3, a3, -8
    li t1, 15
    la t2, dav1d_mc_warp_filter

    lh t6, (a4)
    lh t4, 2(a4)
    vid.v v30
    vwmul.vx v28, v30, t6
1:
    addi t1, t1, -1


    vsetvli zero, zero, e32, m2, ta, ma
    vadd.vx v4, v28, a5
    add a5, a5, t4
    vssra.vi v2, v4, 10
    vadd.vx v2, v2, t0
    vsll.vi v24, v2, 3
    vsetvli zero, zero, e8, mf2, ta, ma

    vluxseg8ei32.v v2, (t2), v24

    vsetvli zero, zero, e16, m1, ta, ma
.irp i, 2, 3, 4, 5, 6, 7, 8, 9
    vle8.v v10, (a2)
    addi a2, a2, 1

    vsext.vf2 v14, v\i
    vzext.vf2 v16, v10

.if \i == 2
    vwmulsu.vv v12, v14, v16
.else
    vwmaccsu.vv v12, v14, v16
.endif
.endr
    vnclip.wi v10, v12, 3

    add a2, a2, a3
    vse16.v v10, (t5)
    addi t5, t5, 16

    bnez t1, 1b

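    // Vertical pass: 8 output rows, filtering columns of the int16_t scratch
    // buffer with coefficients selected from my and abcd[2..3], then >> 11,
    // clamped to [0, 255] and stored as bytes.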
    mv t5, sp
    li t1, 8

    lh t6, 4(a4)
    lh t4, 6(a4)
    vwmul.vx v28, v30, t6
2:
    addi t1, t1, -1

    vsetvli zero, zero, e32, m2, ta, ma
    vadd.vx v4, v28, a6

    add a6, a6, t4
    vssra.vi v2, v4, 10
    vadd.vx v2, v2, t0
    vsll.vi v24, v2, 3
    vsetvli zero, zero, e8, mf2, ta, ma

    vluxseg8ei32.v v2, (t2), v24
    vsetvli zero, zero, e16, m1, ta, ma

.irp i, 2, 3, 4, 5, 6, 7, 8, 9
    vle16.v v10, (t5)
    addi t5, t5, 16

    vsext.vf2 v14, v\i

.if \i == 2
    vwmul.vv v12, v14, v10
.else
    vwmacc.vv v12, v14, v10
.endif
.endr
    addi t5, t5, -16*7
    vnclip.wi v10, v12, 11

    vmax.vx v10, v10, zero
    vsetvli zero, zero, e8, mf2, ta, ma

    vnclipu.wi v12, v10, 0

    vse8.v v12, (a0)
    add a0, a0, a1

    bnez t1, 2b

    addi sp, sp, 2*15*8

    ret
endfunc

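// warp_8x8t: same as warp_8x8 above, but the vertical pass keeps the
// intermediate precision for later bidir averaging: it shifts by 7 instead of
// 11 and stores int16_t values to the tmp buffer, whose stride a1 is
// presumably counted in elements, hence the sh1add when advancing a0.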
function warp_8x8t_8bpc_rvv, export=1, ext="v,zba"
    csrw vxrm, zero

    vsetivli zero, 8, e16, m1, ta, ma
    addi sp, sp, -2*15*8
    mv t5, sp
    li t0, 3
    mul t0, a3, t0
    sub a2, a2, t0
    addi a2, a2, -3

    li t0, 64
    addi a3, a3, -8
    li t1, 15
    la t2, dav1d_mc_warp_filter

    lh t6, (a4)
    lh t4, 2(a4)
    vid.v v30
    vwmul.vx v28, v30, t6
1:
    addi t1, t1, -1


    vsetvli zero, zero, e32, m2, ta, ma
    vadd.vx v4, v28, a5
    add a5, a5, t4
    vssra.vi v2, v4, 10
    vadd.vx v2, v2, t0
    vsll.vi v24, v2, 3
    vsetvli zero, zero, e8, mf2, ta, ma

    vluxseg8ei32.v v2, (t2), v24

    vsetvli zero, zero, e16, m1, ta, ma
.irp i, 2, 3, 4, 5, 6, 7, 8, 9
    vle8.v v10, (a2)
    addi a2, a2, 1

    vsext.vf2 v14, v\i
    vzext.vf2 v16, v10

.if \i == 2
    vwmulsu.vv v12, v14, v16
.else
    vwmaccsu.vv v12, v14, v16
.endif
.endr
    vnclip.wi v10, v12, 3

    add a2, a2, a3
    vse16.v v10, (t5)
    addi t5, t5, 16

    bnez t1, 1b

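    // Vertical pass, as in warp_8x8 but without the final clamp/narrow:
    // results stay as int16_t (>> 7) and rows advance by 2*a1 bytes.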
    mv t5, sp
    li t1, 8

    lh t6, 4(a4)
    lh t4, 6(a4)
    vwmul.vx v28, v30, t6
2:
    addi t1, t1, -1

    vsetvli zero, zero, e32, m2, ta, ma
    vadd.vx v4, v28, a6
    add a6, a6, t4
    vssra.vi v2, v4, 10
    vadd.vx v2, v2, t0
    vsll.vi v24, v2, 3
    vsetvli zero, zero, e8, mf2, ta, ma

    vluxseg8ei32.v v2, (t2), v24
    vsetvli zero, zero, e16, m1, ta, ma

.irp i, 2, 3, 4, 5, 6, 7, 8, 9
    vle16.v v10, (t5)
    addi t5, t5, 16

    vsext.vf2 v14, v\i

.if \i == 2
    vwmul.vv v12, v14, v10
.else
    vwmacc.vv v12, v14, v10
.endif

.endr
    addi t5, t5, -16*7
    vnclip.wi v10, v12, 7

    vse16.v v10, (a0)
    sh1add a0, a1, a0

    bnez t1, 2b

    addi sp, sp, 2*15*8

    ret
endfunc