/******************************************************************************
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2024, Bogdan Gligorijevic
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/

#include "src/riscv/asm.S"

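# Vector form of the CDEF constraint: for each widened difference held in
# \vec_tmp1/\vec_tmp2 compute, in place,
#     sign(diff) * min(|diff|, max(0, \strength - (|diff| >> \shift)))
# \vec1/\vec2 are scratch; v0 and v1 are clobbered to hold the sign masks.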
.macro constrain_vectors vec1, vec2, vec_sub, strength, shift, vec_tmp1, vec_tmp2
    vmslt.vx v0, \vec_tmp1, zero
    vneg.v \vec_tmp1, \vec_tmp1, v0.t
    vmmv.m v1, v0

    vmslt.vx v0, \vec_tmp2, zero
    vneg.v \vec_tmp2, \vec_tmp2, v0.t

    vsra.vx \vec1, \vec_tmp1, \shift
    vsra.vx \vec2, \vec_tmp2, \shift

    vrsub.vx \vec1, \vec1, \strength
    vrsub.vx \vec2, \vec2, \strength

    vmax.vx \vec1, \vec1, zero
    vmax.vx \vec2, \vec2, zero

    vmin.vv \vec_tmp1, \vec1, \vec_tmp1
    vmin.vv \vec_tmp2, \vec2, \vec_tmp2

    vneg.v \vec_tmp2, \vec_tmp2, v0.t

    vmmv.m v0, v1
    vneg.v \vec_tmp1, \vec_tmp1, v0.t
.endm

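# padding_fn builds the (w + 4) x (h + 4) int16_t working area around the
# block: src/left/top/bottom pixels are zero-extended from u8 to s16, and any
# side whose "have" bit is clear in the edges mask (bit 0 left, bit 1 right,
# bit 2 top, bit 3 bottom) is filled with INT16_MIN instead.
# Expected register layout (set up by cdef_fn below):
#   a0 = tmp (block origin inside the padded buffer), a1 = tmp_stride (elems),
#   a2 = src (the dst block), a3 = src_stride, a4 = left, a5 = top,
#   a6 = bottom, a7 = edges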
.macro padding_fn w, h
    li t5, -32768 # INT16_MIN

    andi t4, a7, 4
    li t2, -2 # y_start

.if \w == 4
    vsetivli zero, \w + 4, e16, m1, ta, ma
.else
    vsetivli zero, \w + 4, e16, m2, ta, ma
.endif
    vmv.v.x v0, t5
    bnez t4, L(top_done_\w\()x\h)

    slli t5, a1, 1
    addi t5, t5, 2
    slli t5, t5, 1
    sub t5, a0, t5

    sh1add t4, a1, t5
    vse16.v v0, (t5)
    vse16.v v0, (t4)
    li t2, 0

L(top_done_\w\()x\h):
    andi t4, a7, 8
    li t3, 2 + \h # y_end
    bnez t4, L(bottom_done_\w\()x\h)

    li t5, \h
    mul t5, a1, t5
    addi t5, t5, -2
    sh1add t5, t5, a0

    sh1add t4, a1, t5
    vse16.v v0, (t5)
    vse16.v v0, (t4)
    addi t3, t3, -2

L(bottom_done_\w\()x\h):
    andi t4, a7, 1
    li t0, -2 # x_start

.if \w == 4
    vsetivli zero, 2, e16, m1, ta, ma
.else
    vsetivli zero, 2, e16, m2, ta, ma
.endif

    bnez t4, L(left_done_\w\()x\h)

    mul t5, a1, t2
    addi t5, t5, -2
    sh1add t5, t5, a0

    sub t0, t3, t2

3:
    vse16.v v0, (t5)
    sh1add t5, a1, t5
    addi t0, t0, -1
    bnez t0, 3b

L(left_done_\w\()x\h):

    andi t4, a7, 2
    li t1, 2 + \w # x_end
    bnez t4, L(right_done_\w\()x\h)

    mul t5, t2, a1
    addi t5, t5, \w
    sh1add t5, t5, a0

    sub t1, t3, t2

4:
    vse16.v v0, (t5)
    sh1add t5, a1, t5
    addi t1, t1, -1
    bnez t1, 4b

    li t1, \w

L(right_done_\w\()x\h):

    beqz t2, L(top_skip_\w\()x\h)

    mul t5, a1, t2
    add t5, t0, t5
    sh1add a0, t5, a0 # tmp += y_start * tmp_stride + x_start
    add a5, a5, t0

    sub t5, t1, t0 # x_end - x_start
    slli t6, t0, 1
.if \w == 4
    vsetvli zero, t5, e16, m1, ta, ma
.else
    vsetvli zero, t5, e16, m2, ta, ma
.endif

5:
    vle8.v v0, (a5)
    addi t2, t2, 1
    vzext.vf2 v2, v0
    add a5, a3, a5
    vse16.v v2, (a0)
    sh1add a0, a1, a0
    bnez t2, 5b

    sub a0, a0, t6 # tmp -= x_start

L(top_skip_\w\()x\h):

    li a5, \h
    beqz t0, L(left_skip_\w\()x\h)

    sh1add a0, t0, a0 # tmp += x_start

7:
.if \w == 4
    vsetivli zero, 2, e16, m1, ta, ma
.else
    vsetivli zero, 2, e16, m2, ta, ma
.endif

    vle8.v v0, (a4)
    addi a5, a5, -1
    vzext.vf2 v2, v0
    addi a4, a4, 2
    vse16.v v2, (a0)
    sh1add a0, a1, a0
    bnez a5, 7b

    li a5, \h
    mul t5, a1, a5
    add t5, t5, t0
    slli t5, t5, 1
    sub a0, a0, t5 # tmp -= h * tmp_stride + x_start

L(left_skip_\w\()x\h):

8:
.if \w == 4
    vsetvli zero, t1, e16, m1, ta, ma
.else
    vsetvli zero, t1, e16, m2, ta, ma
.endif

    vle8.v v0, (a2)
    vzext.vf2 v2, v0
    vse16.v v2, (a0)
    add a2, a3, a2
    sh1add a0, a1, a0
    addi a5, a5, -1
    bnez a5, 8b


    li a5, \h
    sh1add a0, t0, a0 # tmp += x_start
    add a6, a6, t0 # bottom += x_start
    beq a5, t3, L(bottom_skip_\w\()x\h)

    sub t5, t1, t0
.if \w == 4
    vsetvli zero, t5, e16, m1, ta, ma
.else
    vsetvli zero, t5, e16, m2, ta, ma
.endif

9:
    vle8.v v0, (a6)
    add a6, a3, a6
    vzext.vf2 v2, v0
    addi a5, a5, 1
    vse16.v v2, (a0)
    sh1add a0, a1, a0
    bne a5, t3, 9b

L(bottom_skip_\w\()x\h):
    li t6, \h
    mul t6, a3, t6
    sub a2, a2, t6 # src -= h * src_stride
    mul t5, a1, t3
    add t5, t5, t0
    slli t5, t5, 1
    sub a0, a0, t5 # tmp -= y_end * tmp_stride + x_start
.endm


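# cdef_fn emits cdef_filter_block_\w\()x\h\()_8bpc_rvv(dst, dst_stride, left,
# top, bottom, pri_strength, sec_strength, dir, damping, edges); damping and
# edges are the stack arguments reloaded below.  The prologue reserves 144
# int16_t of scratch (a 12x12 padded block), runs padding_fn over it, then
# filters dst in place.  vxrm is cleared to rnu so the narrowing shifts below
# round to nearest.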
.macro cdef_fn w, h
function cdef_filter_block_\w\()x\h\()_8bpc_rvv, export=1, ext="v,zba,zbb"
    csrw vxrm, zero

    addi sp, sp, -32 - 144*2
    sd a5, 24(sp) # pri_strength
    sd a6, 16(sp) # sec_strength
    sd a7, 8(sp) # dir


    ld a7, 8 + 32 + 144*2(sp) # edges
    mv a6, a4 # bottom
    mv a5, a3 # top
    mv a4, a2 # left
    mv a3, a1 # dst_stride
    mv a2, a0 # dst
    li a1, 12 # tmp_stride
    addi a0, sp, 32 + 2*(2*12+2)
    padding_fn \w, \h

    ld a4, 32 + 2*144(sp) # damping
    ld a5, 24(sp) # pri_strength
    ld a6, 16(sp) # sec_strength
    ld a7, 8(sp) # dir

    beqz a5, cdef_filter_sec_only_\w\()x\h

    bnez a6, cdef_filter_pri_sec_\w\()x\h

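# Primary-only filtering.  t4 = primary tap for the nearest pair of taps,
# 4 - (pri_strength & 1); the farther pair below uses (t4 & 3) | 2.
# t1 = pri_shift = max(0, damping - ulog2(pri_strength)), with ulog2
# computed as 63 - clz.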
    andi t0, a5, 1
    li t1, 4
    sub t4, t1, t0

    li t1, 63
    clz t2, a5
    sub t1, t1, t2
    sub t1, a4, t1

    li t0, \h

    la t2, dav1d_cdef_directions
    addi t3, a7, 2
    sh1add t2, t3, t2

    blt zero, t1, 1f
    mv t1, zero
1:
    vsetivli zero, \w, e16, m1, ta, mu

    lb t3, 0(t2)

    vle8.v v0, (a2)
    vzext.vf2 v2, v0

    sh1add t6, t3, a0
    slli t3, t3, 1
    sub t3, a0, t3

    vle16.v v4, (t6)
    vle16.v v6, (t3)

    vwsub.vv v8, v4, v2
    vwsub.vv v16, v6, v2

    vsetvli zero, zero, e32, m2, ta, mu

    constrain_vectors v4, v6, v12, a5, t1, v8, v16

    vmul.vx v28, v16, t4
    vmacc.vx v28, t4, v8

    lb t3, 1(t2)

    andi t5, t4, 3
    ori t5, t5, 2

    sh1add t6, t3, a0
    slli t3, t3, 1
    sub t3, a0, t3

    vsetvli zero, zero, e16, m1, ta, mu

    vle16.v v4, (t6)
    vle16.v v6, (t3)

    vwsub.vv v8, v4, v2
    vwsub.vv v16, v6, v2

    vsetvli zero, zero, e32, m2, ta, mu

    constrain_vectors v4, v6, v12, a5, t1, v8, v16

    vmacc.vx v28, t5, v16
    vmacc.vx v28, t5, v8

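    # Negative sums are biased by -1 so that, together with the rnu rounding
    # of the narrowing shift by 4 (which adds 8), the result rounds the same
    # way as the scalar (8 + sum - (sum < 0)) >> 4.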
    vmslt.vx v0, v28, zero
    vadd.vi v28, v28, -1, v0.t

    vsetvli zero, zero, e16, m1, ta, ma

    vnclip.wi v24, v28, 4

    vadd.vv v28, v2, v24

    vsetvli zero, zero, e8, mf2, ta, ma

    vnclipu.wi v24, v28, 0

    vse8.v v24, (a2)

    addi t0, t0, -1
    add a2, a2, a3
    sh1add a0, a1, a0

    bnez t0, 1b

    addi sp, sp, 32 + 144*2
    ret

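# Secondary-only filtering.  t1 = sec_shift = damping - ulog2(sec_strength);
# the secondary taps are 2 for the first pair of offsets and 1 for the second
# (accumulated with plain vadd.vv below).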
cdef_filter_sec_only_\w\()x\h:
    li t1, 63
    clz t2, a6
    sub t1, t1, t2
    sub t1, a4, t1

    li t0, \h

    la t2, dav1d_cdef_directions
    addi t3, a7, 4
    sh1add t3, t3, t2
    sh1add t2, a7, t2

2:
    vsetivli zero, \w, e16, m1, ta, mu

    lb t4, 0(t3)
    lb t5, 0(t2)

    vle8.v v0, (a2)
    vzext.vf2 v2, v0

    sh1add t6, t4, a0
    slli t4, t4, 1
    sub t4, a0, t4

    vle16.v v4, (t6)
    vle16.v v6, (t4)

    sh1add t4, t5, a0
    slli t5, t5, 1
    sub t5, a0, t5

    vle16.v v8, (t4)
    vle16.v v10, (t5)

    vwsub.vv v12, v4, v2
    vwsub.vv v14, v6, v2
    vwsub.vv v16, v8, v2
    vwsub.vv v18, v10, v2

    vsetvli zero, zero, e32, m2, ta, mu

    li t4, 2
    constrain_vectors v4, v6, v12, a6, t1, v12, v14
    constrain_vectors v8, v10, v14, a6, t1, v16, v18

    vmul.vx v28, v18, t4
    vmacc.vx v28, t4, v16
    vmacc.vx v28, t4, v14
    vmacc.vx v28, t4, v12


    lb t4, 1(t3)
    lb t5, 1(t2)

    sh1add t6, t4, a0
    slli t4, t4, 1
    sub t4, a0, t4

    vsetvli zero, zero, e16, m1, ta, mu

    vle16.v v4, (t6)
    vle16.v v6, (t4)

    sh1add t4, t5, a0
    slli t5, t5, 1
    sub t5, a0, t5

    vle16.v v8, (t4)
    vle16.v v10, (t5)

    vwsub.vv v12, v4, v2
    vwsub.vv v14, v6, v2
    vwsub.vv v16, v8, v2
    vwsub.vv v18, v10, v2

    vsetvli zero, zero, e32, m2, ta, mu

    constrain_vectors v4, v6, v12, a6, t1, v12, v14
    constrain_vectors v8, v10, v14, a6, t1, v16, v18

    vadd.vv v4, v28, v12
    vadd.vv v28, v4, v14
    vadd.vv v4, v28, v16
    vadd.vv v28, v4, v18

    vmslt.vx v0, v28, zero
    vadd.vi v28, v28, -1, v0.t

    vsetvli zero, zero, e16, m1, ta, ma

    vnclip.wi v24, v28, 4

    vadd.vv v28, v2, v24

    vsetvli zero, zero, e8, mf2, ta, ma

    vnclipu.wi v24, v28, 0

    vse8.v v24, (a2)

    addi t0, t0, -1
    add a2, a2, a3
    sh1add a0, a1, a0

    bnez t0, 2b

    addi sp, sp, 32 + 144*2
    ret
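# Combined primary + secondary filtering.  t1 = pri_shift (clamped to >= 0),
# t2 = sec_shift.  v20/v24 additionally track the running min/max of the
# centre pixel and sampled taps so the filtered pixel can be clamped to that
# range before the final narrowing, mirroring the scalar reference.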
cdef_filter_pri_sec_\w\()x\h:

    li t1, 63
    clz t2, a5
    clz t3, a6
    sub t2, t1, t2
    sub t3, t1, t3
    sub t1, a4, t2
    sub t2, a4, t3

    li t0, \h

    la t3, dav1d_cdef_directions

    blt zero, t1, 3f
    mv t1, zero
3:
    vsetivli zero, \w, e16, m1, ta, ma

    li t4, 4
    andi t6, a5, 1
    addi t5, a7, 2
    sub t4, t4, t6

    sh1add t5, t5, t3

    vle8.v v0, (a2)

    lb t6, 0(t5)

    vzext.vf2 v2, v0

    sh1add a4, t6, a0
    slli t6, t6, 1
    sub t6, a0, t6

    vle16.v v4, (a4)
    vle16.v v6, (t6)

    vminu.vv v20, v4, v2
    vmax.vv v24, v4, v2
    vminu.vv v20, v6, v20
    vmax.vv v24, v6, v24

    vwsub.vv v8, v4, v2
    vwsub.vv v16, v6, v2

    vsetvli zero, zero, e32, m2, ta, mu

    constrain_vectors v4, v6, v12, a5, t1, v8, v16

    vmul.vx v28, v16, t4
    vmacc.vx v28, t4, v8

    lb t6, 1(t5)

    andi t4, t4, 3
    ori t4, t4, 2


    sh1add a4, t6, a0
    slli t6, t6, 1
    sub t6, a0, t6

    vsetvli zero, zero, e16, m1, ta, ma

    vle16.v v4, (a4)
    vle16.v v6, (t6)

    vminu.vv v20, v4, v20
    vmax.vv v24, v4, v24
    vminu.vv v20, v6, v20
    vmax.vv v24, v6, v24

    vwsub.vv v8, v4, v2
    vwsub.vv v16, v6, v2

    vsetvli zero, zero, e32, m2, ta, mu

    constrain_vectors v4, v6, v12, a5, t1, v8, v16

    addi t5, a7, 4
    vmacc.vx v28, t4, v16
    vmacc.vx v28, t4, v8

    sh1add t5, t5, t3

    lb t6, 0(t5)

    sh1add a4, t6, a0
    slli t6, t6, 1
    sub t6, a0, t6

    vsetvli zero, zero, e16, m1, ta, ma

    vle16.v v4, (a4)
    vle16.v v6, (t6)

    vminu.vv v20, v4, v20
    vmax.vv v24, v4, v24
    vminu.vv v20, v6, v20
    vmax.vv v24, v6, v24

    vwsub.vv v8, v4, v2
    vwsub.vv v16, v6, v2

    vsetvli zero, zero, e32, m2, ta, mu

    li t6, 2
    constrain_vectors v4, v6, v12, a6, t2, v8, v16

    vmacc.vx v28, t6, v16
    vmacc.vx v28, t6, v8

    lb t6, 1(t5)

    sh1add a4, t6, a0
    slli t6, t6, 1
    sub t6, a0, t6

    vsetvli zero, zero, e16, m1, ta, ma

    vle16.v v4, (a4)
    vle16.v v6, (t6)

    vminu.vv v20, v4, v20
    vmax.vv v24, v4, v24
    vminu.vv v20, v6, v20
    vmax.vv v24, v6, v24

    vwsub.vv v8, v4, v2
    vwsub.vv v16, v6, v2

    vsetvli zero, zero, e32, m2, ta, mu

    constrain_vectors v4, v6, v12, a6, t2, v8, v16

    sh1add t5, a7, t3

    vadd.vv v4, v28, v8
    vadd.vv v28, v4, v16

    vsetvli zero, zero, e16, m1, ta, ma

    lb t6, 0(t5)

    sh1add a4, t6, a0
    slli t6, t6, 1
    sub t6, a0, t6

    vle16.v v4, (a4)
    vle16.v v6, (t6)

    vminu.vv v20, v4, v20
    vmax.vv v24, v4, v24
    vminu.vv v20, v6, v20
    vmax.vv v24, v6, v24

    vwsub.vv v8, v4, v2
    vwsub.vv v16, v6, v2

    vsetvli zero, zero, e32, m2, ta, mu

    li t6, 2
    constrain_vectors v4, v6, v12, a6, t2, v8, v16

    vmacc.vx v28, t6, v16
    vmacc.vx v28, t6, v8

    lb t6, 1(t5)

    sh1add a4, t6, a0
    slli t6, t6, 1
    sub t6, a0, t6

    vsetvli zero, zero, e16, m1, ta, ma

    vle16.v v4, (a4)
    vle16.v v6, (t6)

    vminu.vv v20, v4, v20
    vmax.vv v24, v4, v24
    vminu.vv v20, v6, v20
    vmax.vv v24, v6, v24

    vwsub.vv v8, v4, v2
    vwsub.vv v16, v6, v2

    vsetvli zero, zero, e32, m2, ta, mu

    constrain_vectors v4, v6, v12, a6, t2, v8, v16

    vadd.vv v4, v28, v8
    vadd.vv v28, v4, v16

    vmslt.vx v0, v28, zero
    vadd.vi v28, v28, -1, v0.t

    vsetvli zero, zero, e16, m1, ta, mu

    vnclip.wi v16, v28, 4

    vadd.vv v28, v2, v16

    vmslt.vv v0, v20, v28
    vmerge.vvm v4, v20, v28, v0

    vmslt.vv v0, v4, v24
    vmerge.vvm v28, v24, v4, v0

    vsetvli zero, zero, e8, mf2, ta, ma

    vnclipu.wi v24, v28, 0

    vse8.v v24, (a2)

    addi t0, t0, -1
    add a2, a2, a3
    sh1add a0, a1, a0

    bnez t0, 3b

    addi sp, sp, 32 + 144*2
    ret
endfunc
.endm

cdef_fn 4, 4
cdef_fn 4, 8
cdef_fn 8, 8
