/******************************************************************************
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2024, Bogdan Gligorijevic
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/

#include "src/riscv/asm.S"

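# constrain_vectors: apply the CDEF constrain() step to the two widened
# difference vectors \vec_tmp1/\vec_tmp2 in place:
#     constrain(d) = sign(d) * min(|d|, max(0, strength - (|d| >> shift)))
# \vec1/\vec2 are used as scratch, and v0/v1 are clobbered as mask registers.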
.macro constrain_vectors vec1, vec2, vec_sub, strength, shift, vec_tmp1, vec_tmp2
    vmslt.vx v0, \vec_tmp1, zero
    vneg.v \vec_tmp1, \vec_tmp1, v0.t
    vmmv.m v1, v0

    vmslt.vx v0, \vec_tmp2, zero
    vneg.v \vec_tmp2, \vec_tmp2, v0.t

    vsra.vx \vec1, \vec_tmp1, \shift
    vsra.vx \vec2, \vec_tmp2, \shift

    vrsub.vx \vec1, \vec1, \strength
    vrsub.vx \vec2, \vec2, \strength

    vmax.vx \vec1, \vec1, zero
    vmax.vx \vec2, \vec2, zero

    vmin.vv \vec_tmp1, \vec1, \vec_tmp1
    vmin.vv \vec_tmp2, \vec2, \vec_tmp2

    vneg.v \vec_tmp2, \vec_tmp2, v0.t

    vmmv.m v0, v1
    vneg.v \vec_tmp1, \vec_tmp1, v0.t
.endm

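# padding_fn: build the (w + 4) x (h + 4) padded copy of the block in the tmp
# buffer at a0 (stride a1). Rows/columns that are unavailable according to the
# edge flags in a7 (bit 0: left, bit 1: right, bit 2: top, bit 3: bottom) are
# filled with INT16_MIN; the rest is copied from top (a5), left (a4), src (a2)
# and bottom (a6).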
.macro padding_fn w, h
    li t5, -32768 # INT16_MIN

    andi t4, a7, 4
    li t2, -2 # y_start

.if \w == 4
    vsetivli zero, \w + 4, e16, m1, ta, ma
.else
    vsetivli zero, \w + 4, e16, m2, ta, ma
.endif
    vmv.v.x v0, t5
    bnez t4, L(top_done_\w\()x\h)

    slli t5, a1, 1
    addi t5, t5, 2
    slli t5, t5, 1
    sub t5, a0, t5

    sh1add t4, a1, t5
    vse16.v v0, (t5)
    vse16.v v0, (t4)
    li t2, 0

L(top_done_\w\()x\h):
    andi t4, a7, 8
    li t3, 2 + \h # y_end
    bnez t4, L(bottom_done_\w\()x\h)

    li t5, \h
    mul t5, a1, t5
    addi t5, t5, -2
    sh1add t5, t5, a0

    sh1add t4, a1, t5
    vse16.v v0, (t5)
    vse16.v v0, (t4)
    addi t3, t3, -2

L(bottom_done_\w\()x\h):
    andi t4, a7, 1
    li t0, -2 # x_start

.if \w == 4
    vsetivli zero, 2, e16, m1, ta, ma
.else
    vsetivli zero, 2, e16, m2, ta, ma
.endif

    bnez t4, L(left_done_\w\()x\h)

    mul t5, a1, t2
    addi t5, t5, -2
    sh1add t5, t5, a0

    sub t0, t3, t2

3:
    vse16.v v0, (t5)
    sh1add t5, a1, t5
    addi t0, t0, -1
    bnez t0, 3b

L(left_done_\w\()x\h):

    andi t4, a7, 2
    li t1, 2 + \w # x_end
    bnez t4, L(right_done_\w\()x\h)

    mul t5, t2, a1
    addi t5, t5, \w
    sh1add t5, t5, a0

    sub t1, t3, t2

4:
    vse16.v v0, (t5)
    sh1add t5, a1, t5
    addi t1, t1, -1
    bnez t1, 4b

    li t1, \w

L(right_done_\w\()x\h):

    beqz t2, L(top_skip_\w\()x\h)

    mul t5, a1, t2
    add t5, t0, t5
    sh1add a0, t5, a0 # tmp += y_start * tmp_stride + x_start
    sh1add a5, t0, a5 # top += x_start

    sub t5, t1, t0
    slli t6, t0, 1
.if \w == 4
    vsetvli zero, t5, e16, m1, ta, ma
.else
    vsetvli zero, t5, e16, m2, ta, ma
.endif

5:
    vle16.v v2, (a5)
    addi t2, t2, 1
    add a5, a3, a5
    vse16.v v2, (a0)
    sh1add a0, a1, a0
    bnez t2, 5b

    sub a0, a0, t6 # tmp -= x_start

L(top_skip_\w\()x\h):

    li a5, \h
    beqz t0, L(left_skip_\w\()x\h)

    sh1add a0, t0, a0 # tmp += x_start

7:
.if \w == 4
    vsetivli zero, 2, e16, m1, ta, ma
.else
    vsetivli zero, 2, e16, m2, ta, ma
.endif

    vle16.v v2, (a4)
    addi a5, a5, -1
    addi a4, a4, 4
    vse16.v v2, (a0)
    sh1add a0, a1, a0
    bnez a5, 7b

    li a5, \h
    mul t5, a1, a5
    add t5, t5, t0
    slli t5, t5, 1
    sub a0, a0, t5 # tmp -= h * tmp_stride + x_start

L(left_skip_\w\()x\h):

8:
.if \w == 4
    vsetvli zero, t1, e16, m1, ta, ma
.else
    vsetvli zero, t1, e16, m2, ta, ma
.endif

    vle16.v v2, (a2)
    add a2, a3, a2
    vse16.v v2, (a0)
    sh1add a0, a1, a0
    addi a5, a5, -1
    bnez a5, 8b


    li a5, \h
    sh1add a0, t0, a0 # tmp += x_start
    sh1add a6, t0, a6 # bottom += x_start
    beq a5, t3, L(bottom_skip_\w\()x\h)

    sub t5, t1, t0
.if \w == 4
    vsetvli zero, t5, e16, m1, ta, ma
.else
    vsetvli zero, t5, e16, m2, ta, ma
.endif

9:
    vle16.v v2, (a6)
    add a6, a3, a6
    addi a5, a5, 1
    vse16.v v2, (a0)
    sh1add a0, a1, a0
    bne a5, t3, 9b

L(bottom_skip_\w\()x\h):
    li t6, \h
    mul t6, a3, t6
    sub a2, a2, t6 # src -= h * PXSTRIDE(src_stride)
    mul t5, a1, t3
    add t5, t5, t0
    slli t5, t5, 1
    sub a0, a0, t5 # tmp -= y_end * tmp_stride + x_start
.endm

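# cdef_fn: emit cdef_filter_block_WxH_16bpc_rvv. The prologue spills the
# arguments, builds the padded block on the stack via padding_fn, and then
# takes one of three paths: primary-only, secondary-only, or primary +
# secondary.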
.macro cdef_fn w, h
function cdef_filter_block_\w\()x\h\()_16bpc_rvv, export=1, ext="v,zba,zbb"
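    # vxrm = 0 (round-to-nearest-up), so the vnclip.wi by 4 below rounds as
    # (x + 8) >> 4; the vmslt/vadd.vi pair subtracts 1 from negative sums first.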
    csrw vxrm, zero

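    # Stack frame: 32 bytes of spill slots followed by a 12 x 12 int16_t tmp
    # buffer (144 elements); a0 is pointed 2 rows + 2 columns into it so the
    # padded borders sit at negative offsets.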
    addi sp, sp, -32 - 144*2
    sd a5, 24(sp) # pri_strength
    sd a6, 16(sp) # sec_strength
    sd a7, 8(sp) # dir

    ld a7, 8 + 32 + 144*2(sp) # edges
    mv a6, a4 # bottom
    mv a5, a3 # top
    mv a4, a2 # left
    mv a3, a1 # dst_stride
    mv a2, a0 # dst
    li a1, 12 # tmp_stride
    addi a0, sp, 32 + 2*(2*12+2)

    padding_fn \w, \h

    ld a4, 32 + 2*144(sp) # damping
    ld a5, 24(sp) # pri_strength
    ld a6, 16(sp) # sec_strength
    ld a7, 8(sp) # dir

    beqz a5, cdef_filter_sec_only_\w\()x\h

    bnez a6, cdef_filter_pri_sec_\w\()x\h

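    # Primary-only path: t4 = pri_tap = 4 - ((pri_strength >> (bitdepth - 8)) & 1),
    # t1 = pri_shift = damping - ulog2(pri_strength), clamped to zero just
    # before the row loop.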
    li t1, 64-8
    ld t4, 32 + 2*144 + 16(sp) # bitdepth_max
    clz t4, t4
    sub t4, t1, t4
    sra t4, a5, t4
    andi t0, t4, 1
    li t1, 4
    sub t4, t1, t0

    li t1, 63
    clz t2, a5
    sub t1, t1, t2
    sub t1, a4, t1

    li t0, \h

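    # t2 -> dav1d_cdef_directions[dir + 2], the two primary tap offsets; the
    # table entries are element offsets that already include the tmp stride
    # of 12, so each tap is read at tmp +/- offset.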
    la t2, dav1d_cdef_directions
    addi t3, a7, 2
    sh1add t2, t3, t2

    vsetivli zero, \w, e16, m1, ta, ma
    blt zero, t1, 1f
    mv t1, zero
1:
    lb t3, 0(t2)

    vle16.v v2, (a2)

    sh1add t6, t3, a0
    slli t3, t3, 1
    sub t3, a0, t3

    vle16.v v4, (t6)
    vle16.v v6, (t3)

    vwsub.vv v8, v4, v2
    vwsub.vv v16, v6, v2

    vsetvli zero, zero, e32, m2, ta, mu

    constrain_vectors v4, v6, v2, a5, t1, v8, v16

    vmul.vx v28, v16, t4
    vmacc.vx v28, t4, v8

    lb t3, 1(t2)

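    # Second primary tap: (pri_tap & 3) | 2 turns pri_tap 4 into 2 and keeps 3,
    # giving the {4, 2} / {3, 3} tap pairs.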
    andi t5, t4, 3
    ori t5, t5, 2

    sh1add t6, t3, a0
    slli t3, t3, 1
    sub t3, a0, t3

    vsetvli zero, zero, e16, m1, ta, ma

    vle16.v v4, (t6)
    vle16.v v6, (t3)

    vwsub.vv v8, v4, v2
    vwsub.vv v16, v6, v2

    vsetvli zero, zero, e32, m2, ta, mu

    constrain_vectors v4, v6, v2, a5, t1, v8, v16

    vmacc.vx v28, t5, v16
    vmacc.vx v28, t5, v8

    vmslt.vx v0, v28, zero
    vadd.vi v28, v28, -1, v0.t

    vsetvli zero, zero, e16, m1, ta, ma

    vnclip.wi v24, v28, 4

    vadd.vv v28, v2, v24

    vse16.v v28, (a2)

    add a2, a2, a3
    sh1add a0, a1, a0

    addi t0, t0, -1
    bnez t0, 1b

    addi sp, sp, 32 + 144*2
    ret

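# Secondary-only path: taps are 2 for the first offset pair and 1 for the
# second, with shift = damping - ulog2(sec_strength).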
cdef_filter_sec_only_\w\()x\h:
    li t1, 63
    clz t2, a6
    sub t1, t1, t2
    sub t1, a4, t1

    li t0, \h

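    # t3 -> dav1d_cdef_directions[dir + 4], t2 -> dav1d_cdef_directions[dir],
    # the two secondary tap directions.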
    la t2, dav1d_cdef_directions
    addi t3, a7, 4
    sh1add t3, t3, t2
    sh1add t2, a7, t2

    vsetivli zero, \w, e16, m1, ta, ma
2:

    lb t4, 0(t3)
    lb t5, 0(t2)

    vle16.v v2, (a2)

    sh1add t6, t4, a0
    slli t4, t4, 1
    sub t4, a0, t4

    vle16.v v4, (t6)
    vle16.v v6, (t4)

    sh1add t4, t5, a0
    slli t5, t5, 1
    sub t5, a0, t5

    vle16.v v8, (t4)
    vle16.v v10, (t5)

    vwsub.vv v12, v4, v2
    vwsub.vv v14, v6, v2
    vwsub.vv v16, v8, v2
    vwsub.vv v18, v10, v2

    vsetvli zero, zero, e32, m2, ta, mu

    li t4, 2
    constrain_vectors v4, v6, v2, a6, t1, v12, v14
    constrain_vectors v8, v10, v2, a6, t1, v16, v18

    vmul.vx v28, v18, t4
    vmacc.vx v28, t4, v16
    vmacc.vx v28, t4, v14
    vmacc.vx v28, t4, v12

    lb t4, 1(t3)
    lb t5, 1(t2)

    sh1add t6, t4, a0
    slli t4, t4, 1
    sub t4, a0, t4

    vsetvli zero, zero, e16, m1, ta, ma

    vle16.v v4, (t6)
    vle16.v v6, (t4)

    sh1add t4, t5, a0
    slli t5, t5, 1
    sub t5, a0, t5

    vle16.v v8, (t4)
    vle16.v v10, (t5)

    vwsub.vv v12, v4, v2
    vwsub.vv v14, v6, v2
    vwsub.vv v16, v8, v2
    vwsub.vv v18, v10, v2

    vsetvli zero, zero, e32, m2, ta, mu

    constrain_vectors v4, v6, v2, a6, t1, v12, v14
    constrain_vectors v8, v10, v2, a6, t1, v16, v18

    vadd.vv v4, v28, v12
    vadd.vv v28, v4, v14
    vadd.vv v4, v28, v16
    vadd.vv v28, v4, v18

    vmslt.vx v0, v28, zero
    vadd.vi v28, v28, -1, v0.t

    vsetvli zero, zero, e16, m1, ta, ma

    vnclip.wi v24, v28, 4

    vadd.vv v28, v2, v24

    vse16.v v28, (a2)

    add a2, a2, a3
    sh1add a0, a1, a0

    addi t0, t0, -1
    bnez t0, 2b

    addi sp, sp, 32 + 144*2
    ret
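# Primary + secondary path: same as the two paths above, but a running
# minimum/maximum of every sampled pixel is kept in v20/v24 so the filtered
# result can be clipped to the local pixel range before it is stored.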
cdef_filter_pri_sec_\w\()x\h:

    li t1, 63
    clz t2, a5
    clz t3, a6
    sub t2, t1, t2
    sub t3, t1, t3
    sub t1, a4, t2
    sub t2, a4, t3

    li t0, \h

    la t3, dav1d_cdef_directions

    vsetivli zero, \w, e16, m1, ta, ma
    blt zero, t1, 3f
    mv t1, zero
3:
    li t5, 64-8
    ld t4, 32 + 2*144 + 16(sp) # bitdepth_max
    clz t4, t4
    sub t4, t5, t4
    sra t4, a5, t4
    li t6, 4
    andi t5, t4, 1
    sub t4, t6, t5

    addi t5, a7, 2

    sh1add t5, t5, t3

    vle16.v v2, (a2)

    lb t6, 0(t5)

    sh1add a4, t6, a0
    slli t6, t6, 1
    sub t6, a0, t6

    vle16.v v4, (a4)
    vle16.v v6, (t6)

    vminu.vv v20, v4, v2
    vmax.vv v24, v4, v2
    vminu.vv v20, v6, v20
    vmax.vv v24, v6, v24

    vwsub.vv v8, v4, v2
    vwsub.vv v16, v6, v2

    vsetvli zero, zero, e32, m2, ta, mu

    constrain_vectors v4, v6, v2, a5, t1, v8, v16

    vmul.vx v28, v16, t4
    vmacc.vx v28, t4, v8

    andi t4, t4, 3
    ori t4, t4, 2

    lb t6, 1(t5)

    sh1add a4, t6, a0
    slli t6, t6, 1
    sub t6, a0, t6

    vsetvli zero, zero, e16, m1, ta, ma

    vle16.v v4, (a4)
    vle16.v v6, (t6)

    vminu.vv v20, v4, v20
    vmax.vv v24, v4, v24
    vminu.vv v20, v6, v20
    vmax.vv v24, v6, v24

    vwsub.vv v8, v4, v2
    vwsub.vv v16, v6, v2

    vsetvli zero, zero, e32, m2, ta, mu

    constrain_vectors v4, v6, v2, a5, t1, v8, v16

    addi t5, a7, 4
    vmacc.vx v28, t4, v16
    vmacc.vx v28, t4, v8

    sh1add t5, t5, t3

    lb t6, 0(t5)

    sh1add a4, t6, a0
    slli t6, t6, 1
    sub t6, a0, t6

    vsetvli zero, zero, e16, m1, ta, ma

    vle16.v v4, (a4)
    vle16.v v6, (t6)

    vminu.vv v20, v4, v20
    vmax.vv v24, v4, v24
    vminu.vv v20, v6, v20
    vmax.vv v24, v6, v24

    vwsub.vv v8, v4, v2
    vwsub.vv v16, v6, v2

    vsetvli zero, zero, e32, m2, ta, mu

    li t6, 2
    constrain_vectors v4, v6, v2, a6, t2, v8, v16

    vmacc.vx v28, t6, v16
    vmacc.vx v28, t6, v8

    lb t6, 1(t5)

    sh1add a4, t6, a0
    slli t6, t6, 1
    sub t6, a0, t6

    vsetvli zero, zero, e16, m1, ta, ma

    vle16.v v4, (a4)
    vle16.v v6, (t6)

    vminu.vv v20, v4, v20
    vmax.vv v24, v4, v24
    vminu.vv v20, v6, v20
    vmax.vv v24, v6, v24

    vwsub.vv v8, v4, v2
    vwsub.vv v16, v6, v2

    vsetvli zero, zero, e32, m2, ta, mu

    constrain_vectors v4, v6, v2, a6, t2, v8, v16

    sh1add t5, a7, t3

    vadd.vv v4, v28, v8
    vadd.vv v28, v4, v16

    vsetvli zero, zero, e16, m1, ta, ma

    lb t6, 0(t5)

    sh1add a4, t6, a0
    slli t6, t6, 1
    sub t6, a0, t6

    vle16.v v4, (a4)
    vle16.v v6, (t6)

    vminu.vv v20, v4, v20
    vmax.vv v24, v4, v24
    vminu.vv v20, v6, v20
    vmax.vv v24, v6, v24

    vwsub.vv v8, v4, v2
    vwsub.vv v16, v6, v2

    vsetvli zero, zero, e32, m2, ta, mu

    li t6, 2
    constrain_vectors v4, v6, v2, a6, t2, v8, v16

    vmacc.vx v28, t6, v16
    vmacc.vx v28, t6, v8

    lb t6, 1(t5)

    sh1add a4, t6, a0
    slli t6, t6, 1
    sub t6, a0, t6

    vsetvli zero, zero, e16, m1, ta, ma

    vle16.v v4, (a4)
    vle16.v v6, (t6)

    vminu.vv v20, v4, v20
    vmax.vv v24, v4, v24
    vminu.vv v20, v6, v20
    vmax.vv v24, v6, v24

    vwsub.vv v8, v4, v2
    vwsub.vv v16, v6, v2

    vsetvli zero, zero, e32, m2, ta, mu

    constrain_vectors v4, v6, v2, a6, t2, v8, v16

    vadd.vv v4, v28, v8
    vadd.vv v28, v4, v16

    vmslt.vx v0, v28, zero
    vadd.vi v28, v28, -1, v0.t

    vsetvli zero, zero, e16, m1, ta, ma

    vnclip.wi v16, v28, 4

    vadd.vv v28, v2, v16

    vmslt.vv v0, v20, v28
    vmerge.vvm v4, v20, v28, v0

    vmslt.vv v0, v4, v24
    vmerge.vvm v28, v24, v4, v0

    vse16.v v28, (a2)

    add a2, a2, a3
    sh1add a0, a1, a0

    addi t0, t0, -1
    bnez t0, 3b

    addi sp, sp, 32 + 144*2
    ret
endfunc
.endm

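# Instantiate the block sizes CDEF needs: 4x4, 4x8 and 8x8.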
cdef_fn 4, 4
cdef_fn 4, 8
cdef_fn 8, 8