/******************************************************************************
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2024, Bogdan Gligorijevic
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/

#include "src/riscv/asm.S"

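# dc_gen: compute the DC prediction value from both edges.
# Arguments (inferred from the ipred_cfl wrapper below): a0 = topleft pointer,
# a1 = width, a2 = height; the result is returned in a0.  Non-standard calling
# convention: linked through t0 (jal t0, ...) and returns with jr t0.
# The code sums the width pixels above and the height pixels to the left, adds
# a (width+height)/2 rounding bias and shifts by ctz(width+height).  For
# rectangular blocks that shift only removes the power-of-two factor, so the
# leftover division by 3 (2:1 blocks) or 5 (4:1 blocks) is done as a Q16
# fixed-point multiply (0x5556 ~ 1/3, 0x3334 ~ 1/5).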
function dc_gen_8bpc_rvv, export=1, ext="v,zbb"
    .variant_cc dav1d_dc_gen_8bpc_rvv
    add t1, a1, a2
    srli t5, t1, 1
    mv t1, a1
    addi t2, a0, 1
    vsetvli zero, t1, e16, m4, ta, ma
    vmv.v.x v0, zero
1:
    vsetvli t3, t1, e8, m2, tu, ma
    vle8.v v4, (t2)
    vwaddu.wv v0, v0, v4

    sub t1, t1, t3
    add t2, t2, t3
    bnez t1, 1b

    mv t1, a2
    mv t2, a0
    vsetvli zero, t1, e16, m4, ta, ma
    vmv.v.x v8, zero
2:
    vsetvli t3, t1, e8, m2, tu, ma
    sub t2, t2, t3
    vle8.v v4, (t2)
    vwaddu.wv v8, v8, v4
    sub t1, t1, t3

    bnez t1, 2b

    vsetvli zero, zero, e32, m8, ta, ma
    vmv.s.x v16, t5
    vmv.s.x v12, zero
    vsetvli zero, a1, e16, m4, ta, ma
    vwredsum.vs v24, v0, v16
    vsetvli zero, a2, e16, m4, ta, ma
    vwredsum.vs v16, v8, v12
    vsetvli zero, zero, e32, m8, ta, ma
    vmv.x.s t5, v24
    vmv.x.s t1, v16
    add t5, t5, t1

    add t1, a1, a2
    ctz t1, t1

    srl a0, t5, t1


    beq a1, a2, 5f
    slli t1, a1, 1
    sltu t2, t1, a2
    slli t3, a2, 1
    sltu t1, t3, a1
    or t1, t1, t2
    bnez t1, 3f

    li t1, 0x5556
    j 4f
3:
    li t1, 0x3334
4:
    mul a0, a0, t1
    srli a0, a0, 16
5:
    jr t0
endfunc

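# dc_gen_top: DC from the top edge only.
# a0 = topleft, a1 = width (as set up by ipred_cfl_top below); sums
# topleft[1..width] with a width/2 rounding bias in t5, then tail-calls
# dc_gen_sum_up for the shift by ctz(width).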
function dc_gen_top_8bpc_rvv, export=1, ext="v,zbb"
    .variant_cc dav1d_dc_gen_top_8bpc_rvv
    mv t1, a1
    srli t5, a1, 1
    addi a0, a0, 1
    vsetvli zero, t1, e16, m4, ta, ma
    vmv.v.x v0, zero
1:
    vsetvli t3, t1, e8, m2, tu, ma
    vle8.v v4, (a0)
    vwaddu.wv v0, v0, v4
    sub t1, t1, t3

    add a0, a0, t3
    bnez t1, 1b
    j dc_gen_sum_up_8bpc_rvv
endfunc

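# dc_gen_left: DC from the left edge only.
# a0 = topleft, a1 = height (as set up by ipred_cfl_left below); sums the
# height pixels stored below topleft (decreasing addresses) with a height/2
# rounding bias in t5, then tail-calls dc_gen_sum_up.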
function dc_gen_left_8bpc_rvv, export=1, ext="v,zbb"
    .variant_cc dav1d_dc_gen_left_8bpc_rvv
    mv t1, a1
    srli t5, a1, 1
    vsetvli t2, t1, e16, m4, ta, ma
    vmv.v.x v0, zero

1:
    vsetvli t3, t1, e8, m2, tu, ma
    sub a0, a0, t3
    vle8.v v4, (a0)
    vwaddu.wv v0, v0, v4
    sub t1, t1, t3
    bnez t1, 1b

    j dc_gen_sum_up_8bpc_rvv
endfunc

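# dc_gen_sum_up: shared tail of dc_gen_top/dc_gen_left.  Reduces the widening
# accumulator in v0 together with the rounding bias in t5, shifts the sum right
# by ctz(a1) and returns the DC value in a0 through t0.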
function dc_gen_sum_up_8bpc_rvv, export=1, ext="v,zbb"
    .variant_cc dav1d_dc_gen_sum_up_8bpc_rvv
    vsetvli zero, a1, e32, m8, ta, ma
    vmv.s.x v4, t5
    vsetvli zero, zero, e16, m4, ta, ma
    vwredsum.vs v8, v0, v4
    vsetvli zero, zero, e32, m8, ta, ma
    vmv.x.s t5, v8

    ctz t1, a1

    srl a0, t5, t1
    jr t0
endfunc

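# cfl_pred: apply the CFL AC buffer and alpha to a DC value.
# Register layout (inferred from the ipred_cfl_* wrappers below, matching
# dav1d's cfl_pred helper): a0 = dst, a1 = stride, a2 = width, a3 = height,
# a4 = dc, a5 = ac (int16_t *), a6 = alpha.
# Per pixel: diff = alpha * ac; dst = clip_pixel(dc + sign(diff) * ((|diff| + 32) >> 6)).
# vxrm is cleared (round-to-nearest-up) so vssra supplies the +32 bias, and the
# two unsigned narrowing clips clamp the result to [0, 255].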
function cfl_pred_8bpc_rvv, export=1, ext="v,zba"
    csrw vxrm, zero
1:
    li t2, 0
    mv t3, a2
2:
    vsetvli t0, t3, e16, m2, ta, ma
    add t4, a0, t2
    vle16.v v0, (a5)
    sh1add a5, t0, a5

    vwmul.vx v4, v0, a6
    vsetvli zero, zero, e32, m4, ta, mu
    vneg.v v8, v4
    vmslt.vx v0, v4, x0
    vmax.vv v12, v8, v4
    vssra.vi v16, v12, 6
    vneg.v v16, v16, v0.t
    vadd.vx v20, v16, a4
    vmax.vx v0, v20, zero
    vsetvli zero, zero, e16, m2, ta, ma
    vnclipu.wi v4, v0, 0
    vsetvli zero, zero, e8, m1, ta, ma
    vnclipu.wi v0, v4, 0
    vse8.v v0, (t4)
    add t2, t0, t2
    sub t3, t3, t0
    bnez t3, 2b
    addi a3, a3, -1
    add a0, a0, a1

    bnez a3, 1b
    ret
endfunc

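# ipred_cfl: DC over both edges, then chroma-from-luma.
# Assumed dav1d cfl prototype: a0 = dst, a1 = stride, a2 = topleft, a3 = width,
# a4 = height, a5 = ac, a6 = alpha (a5/a6 pass through to cfl_pred untouched).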
function ipred_cfl_8bpc_rvv, export=1, ext=v
    mv t6, a0 # dst
    mv a0, a2 # topleft
    mv t4, a1 # stride
    mv a1, a3 # width
    mv a2, a4 # height
    jal t0, dc_gen_8bpc_rvv
    mv a2, a3 # width
    mv a3, a4 # height
    mv a4, a0 # dc_gen result
    mv a0, t6 # dst
    mv a1, t4 # stride
    j cfl_pred_8bpc_rvv
endfunc

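# ipred_cfl_128: as above, but with a fixed dc of 128 instead of reading the
# edges, so only the width/height registers need shuffling.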
function ipred_cfl_128_8bpc_rvv, export=1, ext="v,zba"
    # dc = 128, then just rearrange registers
    mv a2, a3
    mv a3, a4
    li a4, 128

    j cfl_pred_8bpc_rvv
endfunc

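# ipred_cfl_top: DC from the top edge only, then chroma-from-luma.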
function ipred_cfl_top_8bpc_rvv, export=1, ext=v
    mv t6, a0 # dst
    mv a0, a2 # topleft
    mv t4, a1 # stride
    mv a1, a3 # width
    jal t0, dc_gen_top_8bpc_rvv
    mv a3, a4 # height
    mv a4, a0 # dc_gen_top result
    mv a0, t6 # dst
    mv a2, a1 # width
    mv a1, t4 # stride
    j cfl_pred_8bpc_rvv
endfunc

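# ipred_cfl_left: DC from the left edge only, then chroma-from-luma.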
function ipred_cfl_left_8bpc_rvv, export=1, ext="v,zba"
    mv t6, a0 # dst
    mv a0, a2 # topleft
    mv t4, a1 # stride
    mv a1, a4 # height
    mv a2, a3 # width
    jal t0, dc_gen_left_8bpc_rvv
    mv a3, a4 # height
    mv a4, a0 # dc_gen_left result
    mv a1, t4 # stride
    mv a0, t6 # dst
    j cfl_pred_8bpc_rvv
endfunc

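# ipred_paeth: Paeth prediction.
# Assumed dav1d angular ipred prototype: a0 = dst, a1 = stride, a2 = topleft,
# a3 = width, a4 = height.  Here t1 = topleft, t2 = left[y], v2 = top row.
# For each pixel, base = left + top - topleft and the predictor is whichever
# of left, top and topleft is closest to base (ties prefer left, then top).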
function ipred_paeth_8bpc_rvv, export=1, ext="v,zba"
    csrw vxrm, zero
    li t0, 0
    mv t3, a2
    lbu t1, (a2)
    addi a6, a2, -1
    addi a2, a2, 1
1:
    lbu t2, (a6)
    mv t3, a3
2:
    sub t5, a3, t3
    add t5, a2, t5
    vsetvli t6, t3, e8, m1, ta, ma
    vle8.v v2, (t5)
    vwaddu.vx v4, v2, t2
    vsetvli zero, zero, e16, m2, ta, ma
    vwsub.vx v8, v4, t1

    vsetvli zero, zero, e32, m4, ta, mu
    vzext.vf4 v24, v2
    vsub.vx v12, v8, t1
    vmslt.vx v0, v12, zero
    vneg.v v12, v12, v0.t
    vsub.vx v16, v8, t2
    vmslt.vx v0, v16, zero
    vneg.v v16, v16, v0.t
    vsub.vv v20, v8, v24
    vmslt.vx v0, v20, zero
    vneg.v v20, v20, v0.t

    sub t5, a3, t3
    vmsleu.vv v4, v16, v20
    vmsleu.vv v5, v16, v12
    vmsgtu.vv v0, v20, v12
    vmand.mm v6, v4, v5

    vsetvli zero, zero, e8, m1, ta, ma
    vmerge.vxm v8, v2, t1, v0
    vmmv.m v0, v6
    add t5, a0, t5
    sub t3, t3, t6
    vmerge.vxm v4, v8, t2, v0

    vse8.v v4, (t5)

    bnez t3, 2b

    addi a4, a4, -1
    addi a6, a6, -1
    add a0, a0, a1
    bnez a4, 1b
    ret
endfunc

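# ipred_smooth: SMOOTH prediction, blending the top row and left column towards
# the bottom-left and top-right pixels with the dav1d_sm_weights table.
# Assumed prototype as above: a0 = dst, a1 = stride, a2 = topleft, a3 = width,
# a4 = height.  Per pixel:
#   pred = top[x]*w_v[y] + bottom*(256 - w_v[y]) + left[y]*w_h[x] + right*(256 - w_h[x])
# rounded and shifted right by 9 (vnclipu with vxrm = round-to-nearest-up
# supplies the rounding bias).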
function ipred_smooth_8bpc_rvv, export=1, ext="v,zba"
    csrw vxrm, zero
    la t0, dav1d_sm_weights
    add t1, t0, a3
    add t2, a2, a3
    add t0, t0, a4
    lbu t2, (t2)
    sub t3, a2, a4
    addi a6, a2, -1
    addi a2, a2, 1
    lbu t3, (t3)
1:
    mv t6, a3

    lbu a7, (a6)
    lbu t4, (t0)
2:
    li a5, 256
    vsetvli t5, t6, e8, m1, ta, ma
    vle8.v v2, (t1)
    add t1, t1, t5
    vle8.v v4, (a2)
    add a2, a2, t5
    sub a5, a5, t4

    vwmulu.vx v8, v4, t4
    vsetvli zero, zero, e16, m2, ta, ma
    mul a5, a5, t3

    vadd.vx v4, v8, a5
    vsetvli zero, zero, e8, m1, ta, ma
    vwmulu.vx v8, v2, a7

    vneg.v v12, v2
    vwmaccu.vx v8, t2, v12
    vsetvli zero, zero, e16, m2, ta, ma
    vwaddu.vv v12, v4, v8

    sub a5, a3, t6
    sub t6, t6, t5
    add a5, a5, a0
    vnclipu.wi v2, v12, 9
    vsetvli zero, zero, e8, m1, ta, ma
    vnclipu.wi v0, v2, 0
    vse8.v v0, (a5)

    bnez t6, 2b

    sub t1, t1, a3
    add a0, a0, a1
    sub a2, a2, a3
    addi a4, a4, -1
    addi t0, t0, 1
    addi a6, a6, -1
    bnez a4, 1b

    ret
endfunc

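# ipred_smooth_v: vertical-only SMOOTH prediction; each pixel blends top[x]
# with the bottom-left pixel using the vertical weight, rounded and shifted by 8.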
function ipred_smooth_v_8bpc_rvv, export=1, ext="v,zba"
    csrw vxrm, zero
    la t0, dav1d_sm_weights
    add t2, a2, a3
    add t0, t0, a4
    sub t3, a2, a4
    addi a2, a2, 1
    lbu t3, (t3)
1:
    mv t6, a3

    lbu t4, (t0)
2:
    li a5, 256
    vsetvli t5, t6, e8, m1, ta, ma
    vle8.v v4, (a2)
    add a2, a2, t5
    sub a5, a5, t4

    vwmulu.vx v8, v4, t4
    vsetvli zero, zero, e16, m2, ta, ma
    mul a5, a5, t3
    vwaddu.vx v4, v8, a5

    sub a5, a3, t6
    sub t6, t6, t5
    add a5, a5, a0
    vsetvli zero, zero, e16, m2, ta, ma
    vnclipu.wi v2, v4, 8
    vsetvli zero, zero, e8, m1, ta, ma
    vnclipu.wi v0, v2, 0
    vse8.v v0, (a5)

    bnez t6, 2b

    add a0, a0, a1
    sub a2, a2, a3
    addi a4, a4, -1
    addi t0, t0, 1
    bnez a4, 1b

    ret
endfunc

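# ipred_smooth_h: horizontal-only SMOOTH prediction; each pixel blends left[y]
# with the top-right pixel using the horizontal weight, rounded and shifted by 8.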
function ipred_smooth_h_8bpc_rvv, export=1, ext="v,zba"
    csrw vxrm, zero
    la t0, dav1d_sm_weights
    add t1, t0, a3
    add t2, a2, a3
    lbu t2, (t2)
    addi a6, a2, -1
1:
    mv t6, a3

    lbu a7, (a6)
2:
    vsetvli t5, t6, e8, m1, ta, ma
    vle8.v v2, (t1)
    add t1, t1, t5

    vwmulu.vx v8, v2, a7

    vneg.v v12, v2
    vwmaccu.vx v8, t2, v12

    sub a5, a3, t6
    sub t6, t6, t5
    add a5, a5, a0
    vsetvli zero, zero, e8, m1, ta, ma
    vnclipu.wi v0, v8, 8
    vse8.v v0, (a5)

    bnez t6, 2b

    sub t1, t1, a3
    add a0, a0, a1
    addi a4, a4, -1
    addi a6, a6, -1
    bnez a4, 1b

    ret
endfunc

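# pal_pred: palette prediction.
# Assumed dav1d pal_pred prototype: a0 = dst, a1 = stride, a2 = palette
# (8 one-byte entries at 8 bpc), a3 = idx (two 4-bit indices per byte),
# a4 = width, a5 = height.  Each index byte is split into its two nibbles,
# both are looked up in the palette with vrgather, and the results are
# interleaved into dst with stride-2 stores.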
function pal_pred_8bpc_rvv, export=1, ext="v,zba"
    csrw vxrm, zero
    vsetivli t5, 8, e8, m1, ta, ma
    vle8.v v30, (a2)
    li t0, 2
    srli t1, a4, 1
1:
    mv t4, a4
2:
    vsetvli t5, t1, e8, m1, ta, ma
    vle8.v v0, (a3)
    add a3, a3, t5
    vsrl.vi v2, v0, 4
    sub t6, a4, t4
    vand.vi v1, v0, 7
    add t6, a0, t6
    vrgather.vv v3, v30, v1
    addi t2, t6, 1
    vrgather.vv v4, v30, v2
    slli t5, t5, 1
    vsse8.v v3, (t6), t0
    sub t4, t4, t5
    vsse8.v v4, (t2), t0

    bnez t4, 2b
    addi a5, a5, -1
    add a0, a0, a1
    bnez a5, 1b
    ret
endfunc