/******************************************************************************
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2024, Bogdan Gligorijevic
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/

#include "src/riscv/asm.S"

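# Note on the dc_gen* helpers below: they use a custom calling convention
# (hence .variant_cc), with the return address passed in t0 (jal t0, ...) and
# the helper returning via jr t0.  As set up by the ipred_cfl* wrappers further
# down, a0 holds the topleft pointer on entry and the computed DC value on
# return; dc_gen takes a1 = width and a2 = height, while dc_gen_top/dc_gen_left
# take the length of the single edge they average in a1.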
function dc_gen_16bpc_rvv, export=1, ext="v,zba,zbb"
    .variant_cc dav1d_dc_gen_16bpc_rvv
    add t1, a1, a2
    srli t5, t1, 1
    mv t1, a1
    addi t2, a0, 2
    vsetvli zero, t1, e32, m8, ta, ma
    vmv.v.x v0, zero
1:
    vsetvli t3, t1, e16, m4, tu, ma
    vle16.v v8, (t2)
    vwaddu.wv v0, v0, v8
    sub t1, t1, t3

    sh1add t2, t3, t2
    bnez t1, 1b

    mv t1, a2
    mv t2, a0
    vsetvli zero, t1, e32, m8, ta, ma
    vmv.v.x v16, zero
2:
    vsetvli t3, t1, e16, m4, tu, ma
    sub t1, t1, t3
    sll t3, t3, 1
    sub t2, t2, t3
    vle16.v v8, (t2)
    vwaddu.wv v16, v16, v8

    bnez t1, 2b

    vsetvli zero, a1, e32, m8, ta, ma
    vmv.s.x v24, t5
    vmv.s.x v25, zero
    vredsum.vs v8, v0, v24
    vsetvli zero, a2, e32, m8, ta, ma
    vredsum.vs v0, v16, v25
    vmv.x.s t5, v8
    vmv.x.s t1, v0
    add t5, t5, t1

    add t1, a1, a2
    ctz t1, t1

    srl a0, t5, t1

    beq a1, a2, 5f
    slli t1, a1, 1
    sltu t2, t1, a2
    slli t3, a2, 1
    sltu t1, t3, a1
    or t1, t1, t2
    bnez t1, 3f

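    # Non-square blocks: at this point a0 = sum >> ctz(w + h).  When w != h,
    # w + h is 3 or 5 times a power of two, so a residual divide by 3 (1:2
    # aspect) or by 5 (1:4 aspect, i.e. 2*w < h or 2*h < w) remains.  The
    # multiplies below are fixed-point reciprocals: 0xAAAB = 43691 with
    # 43691 * 3 = 0x20001, and 0x6667 = 26215 with 26215 * 5 = 0x20003, so
    # (a0 * multiplier) >> 17 yields a0 / 3 or a0 / 5 for the value ranges
    # involved here.  Worked example: w = 8, h = 16, a0 = 576 ->
    # (576 * 0xAAAB) >> 17 = 192 = 576 / 3.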
    li t1, 0xAAAB
    j 4f
3:
    li t1, 0x6667
4:
    mul a0, a0, t1
    li t1, 17
    srl a0, a0, t1
5:
    jr t0
endfunc

function dc_gen_top_16bpc_rvv, export=1, ext="v,zba,zbb"
    .variant_cc dav1d_dc_gen_top_16bpc_rvv
    mv t1, a1
    srli t5, a1, 1
    addi a0, a0, 2
    vsetvli zero, t1, e32, m2, ta, ma
    vmv.v.x v0, zero
1:
    vsetvli t3, t1, e16, m1, tu, ma
    vle16.v v4, (a0)
    vwaddu.wv v0, v0, v4

    sh1add a0, t3, a0
    sub t1, t1, t3
    bnez t1, 1b

    j dc_gen_sum_up_16bpc_rvv
endfunc

function dc_gen_left_16bpc_rvv, export=1, ext="v,zba,zbb"
    .variant_cc dav1d_dc_gen_left_16bpc_rvv
    mv t1, a1
    srli t5, a1, 1
    vsetvli zero, t1, e32, m2, ta, ma
    vmv.v.x v0, zero
1:
    vsetvli t3, t1, e16, m1, tu, ma
    sub t1, t1, t3
    slli t3, t3, 1
    sub a0, a0, t3
    vle16.v v4, (a0)
    vwaddu.wv v0, v0, v4

    bnez t1, 1b

    j dc_gen_sum_up_16bpc_rvv
endfunc

function dc_gen_sum_up_16bpc_rvv, export=1, ext="v,zba,zbb"
    .variant_cc dav1d_dc_gen_sum_up_16bpc_rvv

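    # Reduce the partial sums in v0 to dc = (sum + (a1 >> 1)) >> log2(a1):
    # t5 already holds the rounding term a1 >> 1 set up by dc_gen_top/left,
    # and a1 (the number of edge pixels) is a power of two, so the shift by
    # ctz(a1) is an exact division.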
    vsetvli zero, a1, e32, m2, ta, ma
    vmv.s.x v4, t5
    vredsum.vs v8, v0, v4
    vmv.x.s t5, v8

    ctz t1, a1

    srl a0, t5, t1
    jr t0
endfunc

function cfl_pred_16bpc_rvv, export=1, ext="v,zba"
    csrw vxrm, zero
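    # Inner routine shared by the ipred_cfl_* entry points below, which set up:
    # a0 = dst, a1 = stride, a2 = width, a3 = height, a4 = dc, a5 = ac buffer,
    # a6 = alpha, a7 = bitdepth_max.  Per pixel this computes
    #   dst[x] = clip(dc + sign(ac[x] * alpha) * ((abs(ac[x] * alpha) + 32) >> 6),
    #                 0, bitdepth_max)
    # with the +32 rounding coming from vssra.vi ..., 6 under vxrm = 0 (RNU).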
1:
    li t2, 0
    mv t3, a2
2:
    vsetvli t0, t3, e16, m2, ta, ma
    sh1add t4, t2, a0
    vle16.v v0, (a5)
    sh1add a5, t0, a5

    vwmul.vx v4, v0, a6
    vsetvli zero, zero, e32, m4, ta, mu
    vneg.v v8, v4
    vmslt.vx v0, v4, x0
    vmax.vv v12, v8, v4
    vssra.vi v16, v12, 6
    vneg.v v16, v16, v0.t
    vadd.vx v20, v16, a4
    vmax.vx v0, v20, zero
    vmin.vx v0, v0, a7
    vsetvli zero, zero, e16, m2, ta, ma
    vnclipu.wi v4, v0, 0
    vse16.v v4, (t4)
    add t2, t0, t2
    sub t3, t3, t0
    bnez t3, 2b
    addi a3, a3, -1
    add a0, a0, a1

    bnez a3, 1b
    ret
endfunc

function ipred_cfl_16bpc_rvv, export=1, ext=v
    mv t6, a0 # dst
    mv a0, a2 # topleft
    mv t4, a1 # stride
    mv a1, a3 # width
    mv a2, a4 # height
    jal t0, dc_gen_16bpc_rvv
    mv a2, a3 # width
    mv a3, a4 # height
    mv a4, a0 # dc from dc_gen
    mv a0, t6 # dst
    mv a1, t4 # stride
    j cfl_pred_16bpc_rvv
endfunc

function ipred_cfl_128_16bpc_rvv, export=1, ext="v,zba"
    # dc = (bitdepth_max + 1) >> 1, then just rearrange registers
    mv a2, a3
    mv a3, a4
    addi a4, a7, 1
    srli a4, a4, 1
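    # e.g. 10 bpc: bitdepth_max = 0x3ff -> dc = 512; 12 bpc: 0xfff -> dc = 2048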

    j cfl_pred_16bpc_rvv
endfunc

function ipred_cfl_top_16bpc_rvv, export=1, ext=v
    mv t6, a0 # dst
    mv a0, a2 # topleft
    mv t4, a1 # stride
    mv a1, a3 # width
    jal t0, dc_gen_top_16bpc_rvv
    mv a3, a4 # height
    mv a4, a0 # dc from dc_gen_top
    mv a0, t6 # dst
    mv a2, a1 # width
    mv a1, t4 # stride
    j cfl_pred_16bpc_rvv
endfunc

function ipred_cfl_left_16bpc_rvv, export=1, ext=v
    mv t6, a0 # dst
    mv a0, a2 # topleft
    mv t4, a1 # stride
    mv a1, a4 # height
    mv a2, a3 # width
    jal t0, dc_gen_left_16bpc_rvv
    mv a3, a4 # height
    mv a4, a0 # dc from dc_gen_left
    mv a1, t4 # stride
    mv a0, t6 # dst
    j cfl_pred_16bpc_rvv
endfunc

function ipred_paeth_16bpc_rvv, export=1, ext="v,zba"
    csrw vxrm, zero
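    # Paeth predictor, vectorized across the top row (a0 = dst, a1 = stride,
    # a2 = topleft, a3 = width, a4 = height): with base = left + top - topleft,
    # each output pixel picks whichever of left, top and topleft is closest to
    # base, preferring left, then top, then topleft on ties.  t1 = topleft,
    # t2 = left[y] (read downwards via a6), v2 = a vector of top pixels.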
    li t0, 0
    mv t3, a2
    lhu t1, (a2)
    addi a6, a2, -2
    addi a2, a2, 2
1:
    lhu t2, (a6)
    mv t3, a3
2:
    sub t5, a3, t3
    sh1add t5, t5, a2
    vsetvli t6, t3, e16, m2, ta, ma
    vle16.v v2, (t5)
    vwaddu.vx v4, v2, t2

    vsetvli zero, zero, e32, m4, ta, mu
    vsub.vx v8, v4, t1
    vzext.vf2 v24, v2
    vsub.vx v12, v8, t1
    vmslt.vx v0, v12, zero
    vneg.v v12, v12, v0.t
    vsub.vx v16, v8, t2
    vmslt.vx v0, v16, zero
    vneg.v v16, v16, v0.t
    vsub.vv v20, v8, v24
    vmslt.vx v0, v20, zero
    vneg.v v20, v20, v0.t

    sub t5, a3, t3
    vmsleu.vv v4, v16, v20
    vmsleu.vv v5, v16, v12
    vmsgtu.vv v0, v20, v12
    vmand.mm v6, v4, v5

    vsetvli zero, zero, e16, m2, ta, ma
    vmerge.vxm v8, v2, t1, v0
    vmmv.m v0, v6
    sh1add t5, t5, a0
    sub t3, t3, t6
    vmerge.vxm v4, v8, t2, v0

    vse16.v v4, (t5)

    bnez t3, 2b

    addi a4, a4, -1
    addi a6, a6, -2
    add a0, a0, a1
    bnez a4, 1b
    ret
endfunc

function ipred_smooth_16bpc_rvv, export=1, ext="v,zba"
    csrw vxrm, zero
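    # Smooth prediction in both directions.  Pointers set up below:
    # t1 = dav1d_sm_weights + width (horizontal weights),
    # t0 = dav1d_sm_weights + height (vertical weights, one per row),
    # t2 = right pixel topleft[width], t3 = bottom pixel topleft[-height],
    # a7 = left[y], t4 = w_v for the current row.  Each output is
    #   (top*w_v + bottom*(256 - w_v) + left*w_h + right*(256 - w_h) + 256) >> 9,
    # the +256 coming from vnclipu.wi ..., 9 under vxrm = 0 (RNU).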
    la t0, dav1d_sm_weights
    add t1, t0, a3
    sh1add t2, a3, a2
    slli t3, a4, 1
    add t0, t0, a4
    lhu t2, (t2)
    sub t3, a2, t3
    addi a6, a2, -2
    addi a2, a2, 2
    lhu t3, (t3)
1:
    mv t6, a3

    lhu a7, (a6)
    lbu t4, (t0)
2:
    li a5, 256
    vsetvli t5, t6, e16, m2, ta, ma
    vle8.v v2, (t1)
    add t1, t1, t5
    vle16.v v4, (a2)
    sh1add a2, t5, a2
    sub a5, a5, t4

    vwmul.vx v8, v4, t4
    mul a5, a5, t3

    vsetvli zero, zero, e32, m4, ta, ma
    vadd.vx v4, v8, a5

    li a5, 256
    vzext.vf4 v12, v2
    vmul.vx v8, v12, a7

    vrsub.vx v12, v12, a5
    vmacc.vx v8, t2, v12
    vadd.vv v12, v4, v8
    vsetvli zero, zero, e32, m4, ta, ma

    sub a5, a3, t6
    sub t6, t6, t5
    sh1add a5, a5, a0
    vsetvli zero, zero, e16, m2, ta, ma
    vnclipu.wi v2, v12, 9
    vse16.v v2, (a5)

    bnez t6, 2b

    sub t1, t1, a3
    slli t6, a3, 1
    add a0, a0, a1
    sub a2, a2, t6
    addi a4, a4, -1
    addi t0, t0, 1
    addi a6, a6, -2
    bnez a4, 1b

    ret
endfunc

function ipred_smooth_v_16bpc_rvv, export=1, ext="v,zba"
    csrw vxrm, zero
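    # Vertical-only smooth: t0 = dav1d_sm_weights + height (one weight per row),
    # t3 = bottom pixel topleft[-height], v4 = top pixels.  Each output is
    #   (top * w_v + bottom * (256 - w_v) + 128) >> 8,
    # the +128 rounding coming from vnclipu.wi ..., 8 under vxrm = 0 (RNU).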
    la t0, dav1d_sm_weights
    slli t3, a4, 1
    add t0, t0, a4
    sub t3, a2, t3
    addi a2, a2, 2
    lhu t3, (t3)
1:
    mv t6, a3

    lbu t4, (t0)
2:
    li a5, 256
    vsetvli t5, t6, e16, m2, ta, ma
    vle16.v v4, (a2)
    sh1add a2, t5, a2
    sub a5, a5, t4

    vwmul.vx v8, v4, t4
    mul a5, a5, t3

    vsetvli zero, zero, e32, m4, ta, ma
    vadd.vx v4, v8, a5
    vsetvli zero, zero, e32, m4, ta, ma

    sub a5, a3, t6
    sub t6, t6, t5
    sh1add a5, a5, a0
    vsetvli zero, zero, e16, m2, ta, ma
    vnclipu.wi v2, v4, 8
    vse16.v v2, (a5)

    bnez t6, 2b

    slli t6, a3, 1
    add a0, a0, a1
    sub a2, a2, t6
    addi a4, a4, -1
    addi t0, t0, 1
    bnez a4, 1b

    ret
endfunc

function ipred_smooth_h_16bpc_rvv, export=1, ext="v,zba"
    csrw vxrm, zero
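    # Horizontal-only smooth: t1 = dav1d_sm_weights + width (one weight per
    # column), t2 = right pixel topleft[width], a7 = left[y].  Each output is
    #   (left * w_h + right * (256 - w_h) + 128) >> 8,
    # again rounded via vnclipu.wi ..., 8 under vxrm = 0 (RNU).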
    la t0, dav1d_sm_weights
    add t1, t0, a3
    sh1add t2, a3, a2
    lhu t2, (t2)
    addi a6, a2, -2
1:
    mv t6, a3

    lhu a7, (a6)
2:
    vsetvli t5, t6, e16, m2, ta, ma
    vle8.v v2, (t1)
    add t1, t1, t5

    li a5, 256
    vsetvli zero, zero, e32, m4, ta, ma
    vzext.vf4 v12, v2
    vmul.vx v8, v12, a7

    vrsub.vx v12, v12, a5
    vmacc.vx v8, t2, v12

    sub a5, a3, t6
    sub t6, t6, t5
    sh1add a5, a5, a0
    vsetvli zero, zero, e16, m2, ta, ma
    vnclipu.wi v2, v8, 8
    vse16.v v2, (a5)

    bnez t6, 2b

    sub t1, t1, a3
    add a0, a0, a1
    addi a4, a4, -1
    addi a6, a6, -2
    bnez a4, 1b

    ret
endfunc

function pal_pred_16bpc_rvv, export=1, ext="v,zba"
    csrw vxrm, zero
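    # Palette prediction: a0 = dst, a1 = stride, a2 = 8-entry 16-bit palette,
    # a3 = packed indices (two palette indices per byte, values 0..7), a4 =
    # width, a5 = height.  The palette is kept in v30 and looked up with
    # vrgather; the low-nibble and high-nibble results are written back with
    # 4-byte strided stores (t0 = 4) to the even and odd columns of dst.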
    vsetivli t5, 8, e16, m1, ta, ma
    vle16.v v30, (a2)
    li t0, 4
    srli t1, a4, 1
    li t2, 1
1:
    mv t4, a4
2:
    vsetvli t5, t1, e8, mf2, ta, ma
    vle8.v v0, (a3)
    add a3, a3, t5
    vand.vi v1, v0, 7
    sub t6, a4, t4
    vsrl.vi v2, v0, 4
    vwmul.vx v4, v1, t2
    vwmul.vx v6, v2, t2
    vsetvli zero, zero, e16, m1, ta, ma
    sh1add t6, t6, a0
    vrgather.vv v8, v30, v4
    addi t3, t6, 2
    vrgather.vv v10, v30, v6
    slli t5, t5, 1
    vsse16.v v8, (t6), t0
    vsse16.v v10, (t3), t0

    sub t4, t4, t5
    bnez t4, 2b
    add a0, a0, a1
    addi a5, a5, -1
    bnez a5, 1b
    ret
endfunc