/******************************************************************************
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2023, Nathan Egge
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/

#include "src/riscv/asm.S"

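// Shared 4x4 inverse-transform-and-add helper. Going by the register uses
// below: a0 = dst, a1 = dst stride, a2 = coefficient buffer, and a4/a5 hold
// the first- and second-pass transform routines (called via jalr t0).
// vxrm = 0 selects round-to-nearest-up for the fixed-point vector ops
// (vsmul, vssra, vaadd, vnclip) used throughout this file.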
function inv_txfm_add_4x4_rvv, export=1, ext=v
  csrw vxrm, zero

  vsetivli zero, 4, e16, mf2, ta, ma
  vle16.v v0, (a2)
  addi t0, a2, 8
  vle16.v v1, (t0)
  addi t0, t0, 8
  vle16.v v2, (t0)
  addi t0, t0, 8
  vle16.v v3, (t0)

  jalr t0, a4

  vmv.v.x v4, zero

  vsseg4e16.v v0, (a2)
  vle16.v v0, (a2)
  vse16.v v4, (a2)
  addi t0, a2, 8
  vle16.v v1, (t0)
  vse16.v v4, (t0)
  addi t0, t0, 8
  vle16.v v2, (t0)
  vse16.v v4, (t0)
  addi t0, t0, 8
  vle16.v v3, (t0)
  vse16.v v4, (t0)

  jalr t0, a5

  vssra.vi v0, v0, 4
  vssra.vi v1, v1, 4
  vssra.vi v2, v2, 4
  vssra.vi v3, v3, 4

itx_4x4_end:
  vsetvli zero, zero, e8, mf4, ta, ma
  vle8.v v4, (a0)
  add t0, a0, a1
  vle8.v v5, (t0)
  add t0, t0, a1
  vle8.v v6, (t0)
  add t0, t0, a1
  vle8.v v7, (t0)

  vwaddu.wv v0, v0, v4
  vwaddu.wv v1, v1, v5
  vwaddu.wv v2, v2, v6
  vwaddu.wv v3, v3, v7

  vsetvli zero, zero, e16, mf2, ta, ma
  vmax.vx v0, v0, zero
  vmax.vx v1, v1, zero
  vmax.vx v2, v2, zero
  vmax.vx v3, v3, zero

  vsetvli zero, zero, e8, mf4, ta, ma

  vnclipu.wi v4, v0, 0
  vnclipu.wi v5, v1, 0
  vnclipu.wi v6, v2, 0
  vnclipu.wi v7, v3, 0

  vse8.v v4, (a0)
  add a0, a0, a1
  vse8.v v5, (a0)
  add a0, a0, a1
  vse8.v v6, (a0)
  add a0, a0, a1
  vse8.v v7, (a0)

  ret
endfunc

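// 4-point identity transform: scales by roughly sqrt(2). 5793/4096 ~ sqrt(2),
// and vsmul multiplies in Q15, so v + vsmul(v, (5793-4096)*8) ~ v*sqrt(2).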
function inv_identity_e16_x4_rvv, export=1, ext=v
  li t1, (5793-4096)*8
  vsmul.vx v4, v0, t1
  vsmul.vx v5, v1, t1
  vsmul.vx v6, v2, t1
  vsmul.vx v7, v3, t1

  vsadd.vv v0, v0, v4
  vsadd.vv v1, v1, v5
  vsadd.vv v2, v2, v6
  vsadd.vv v3, v3, v7

  jr t0
endfunc

.macro iwht_4
  vadd.vv v0, v0, v1
  vsub.vv v5, v2, v3
  vsub.vv v4, v0, v5
  vsra.vi v4, v4, 1
  vsub.vv v2, v4, v1
  vsub.vv v1, v4, v3
  vadd.vv v3, v5, v2
  vsub.vv v0, v0, v1
.endm

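// 4-point inverse DCT. The constants are the usual AV1 cosine values in Q12:
// 2896 ~ 4096/sqrt(2), 1567 ~ 4096*sin(pi/8), 3784 ~ 4096*cos(pi/8); the
// vnclip.wi by 12 narrows the widened products back to 16-bit with rounding.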
.macro idct_4 o0, o1, o2, o3
  li t1, 2896
  li t2, 1567
  li t3, 3784

  vwmul.vx v16, \o0, t1
  vwmul.vx v18, \o0, t1
  vwmacc.vx v16, t1, \o2
  neg t1, t1
  vwmacc.vx v18, t1, \o2

  vwmul.vx v20, \o1, t3
  neg t3, t3
  vwmul.vx v22, \o1, t2
  vwmacc.vx v20, t2, \o3
  vwmacc.vx v22, t3, \o3

  vnclip.wi v16, v16, 12
  vnclip.wi v18, v18, 12
  vnclip.wi v20, v20, 12
  vnclip.wi v22, v22, 12

  vsadd.vv \o0, v16, v20
  vsadd.vv \o1, v18, v22
  vssub.vv \o2, v18, v22
  vssub.vv \o3, v16, v20
.endm

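// 4-point inverse ADST. 1321, 3803, 2482 and 3344 match the AV1 "sinpi"
// constants used for the 4-point ADST; \lm2/\lm select the LMUL for the
// widened e32 section and the e16 section respectively.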
.macro iadst_4 o0, o1, o2, o3, lm2, lm
  li t1, 1321
  li t2, 3803
  li t3, 2482

  vwmul.vx v16, v0, t1
  vwmul.vx v18, v0, t3
  neg t1, t1
  vwmacc.vx v16, t2, v2
  vwmacc.vx v18, t1, v2
  neg t2, t2
  vwmacc.vx v16, t3, v3
  vwmacc.vx v18, t2, v3

  vwsub.vv v20,  v0, v2
  vwadd.wv v20, v20, v3

  li t1, 3344
  vwmul.vx v22, v1, t1

  vsetvli zero, zero, e32, \lm2, ta, ma

  vmul.vx v20, v20, t1

  vadd.vv v24, v16, v18
  vadd.vv v16, v16, v22
  vadd.vv v18, v18, v22
  vsub.vv v22, v24, v22

  vsetvli zero, zero, e16, \lm, ta, ma

  vnclip.wi \o0, v16, 12
  vnclip.wi \o1, v18, 12
  vnclip.wi \o2, v20, 12
  vnclip.wi \o3, v22, 12
.endm

function inv_dct_e16_x4_rvv, export=1, ext=v
  idct_4 v0, v1, v2, v3
  jr t0
endfunc

function inv_adst_e16_x4_rvv, export=1, ext=v
  iadst_4 v0, v1, v2, v3, m1, mf2
  jr t0
endfunc

function inv_flipadst_e16_x4_rvv, export=1, ext=v
  iadst_4 v3, v2, v1, v0, m1, mf2
  jr t0
endfunc

function inv_adst_e16_x4w_rvv, export=1, ext=v
  iadst_4 v0, v1, v2, v3, m2, m1
  jr t0
endfunc

function inv_flipadst_e16_x4w_rvv, export=1, ext=v
  iadst_4 v3, v2, v1, v0, m2, m1
  jr t0
endfunc

function inv_txfm_add_wht_wht_4x4_8bpc_rvv, export=1, ext=v
  csrw vxrm, zero

  vsetivli zero, 4, e16, mf2, ta, ma
  vle16.v v0, (a2)
  addi t0, a2, 8
  vle16.v v1, (t0)
  addi t0, t0, 8
  vle16.v v2, (t0)
  addi t0, t0, 8
  vle16.v v3, (t0)

  vsra.vi v0, v0, 2
  vsra.vi v1, v1, 2
  vsra.vi v2, v2, 2
  vsra.vi v3, v3, 2

  iwht_4

  vmv.v.x v4, zero

  vsseg4e16.v v0, (a2)
  vle16.v v0, (a2)
  vse16.v v4, (a2)
  addi t0, a2, 8
  vle16.v v1, (t0)
  vse16.v v4, (t0)
  addi t0, t0, 8
  vle16.v v2, (t0)
  vse16.v v4, (t0)
  addi t0, t0, 8
  vle16.v v3, (t0)
  vse16.v v4, (t0)

  iwht_4

  j itx_4x4_end
endfunc

.macro def_fn_4x4 txfm1, txfm2
function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_8bpc_rvv, export=1, ext=v
.ifc \txfm1\()_\txfm2, dct_dct
  beqz a3, 1f
.endif
  la a4, inv_\txfm1\()_e16_x4_rvv
  la a5, inv_\txfm2\()_e16_x4_rvv
  j inv_txfm_add_4x4_rvv
.ifc \txfm1\()_\txfm2, dct_dct
1:
  csrw vxrm, zero
  vsetivli zero, 4, e16, mf2, ta, ma
  ld t2, (a2)
  li t1, 2896*8
  vmv.v.x v0, t2
  vsmul.vx v0, v0, t1
  sd x0, (a2)
  vsmul.vx v0, v0, t1
  vssra.vi v0, v0, 4
  vmv.v.v v1, v0
  vmv.v.v v2, v0
  vmv.v.v v3, v0
  j itx_4x4_end
.endif
endfunc
.endm

def_fn_4x4 dct, dct
def_fn_4x4 identity, identity
def_fn_4x4 dct, adst
def_fn_4x4 dct, flipadst
def_fn_4x4 dct, identity
def_fn_4x4 adst, dct
def_fn_4x4 adst, adst
def_fn_4x4 adst, flipadst
def_fn_4x4 flipadst, dct
def_fn_4x4 flipadst, adst
def_fn_4x4 flipadst, flipadst
def_fn_4x4 identity, dct
def_fn_4x4 adst, identity
def_fn_4x4 flipadst, identity
def_fn_4x4 identity, adst
def_fn_4x4 identity, flipadst

.macro def_fn_8x8_base variant
function inv_txfm_\variant\()add_8x8_rvv, export=1, ext=v
  csrw vxrm, zero

  vsetivli zero, 8, e16, m1, ta, ma
  vle16.v v0, (a2)
  addi t0, a2, 16
  vle16.v v1, (t0)
  addi t0, t0, 16
  vle16.v v2, (t0)
  addi t0, t0, 16
  vle16.v v3, (t0)
  addi t0, t0, 16
  vle16.v v4, (t0)
  addi t0, t0, 16
  vle16.v v5, (t0)
  addi t0, t0, 16
  vle16.v v6, (t0)
  addi t0, t0, 16
  vle16.v v7, (t0)

.ifc \variant, identity_
  // The identity vsadd.vv and downshift vssra.vi 1 cancel out

  j L(itx_8x8_epilog)
.else
  jalr t0, a4

  vssra.vi v0, v0, 1
  vssra.vi v1, v1, 1
  vssra.vi v2, v2, 1
  vssra.vi v3, v3, 1
  vssra.vi v4, v4, 1
  vssra.vi v5, v5, 1
  vssra.vi v6, v6, 1
  vssra.vi v7, v7, 1

L(itx_8x8_epilog):
  vsseg8e16.v v0, (a2)
  vle16.v v0, (a2)
  addi t0, a2, 16
  vle16.v v1, (t0)
  addi t0, t0, 16
  vle16.v v2, (t0)
  addi t0, t0, 16
  vle16.v v3, (t0)
  addi t0, t0, 16
  vle16.v v4, (t0)
  addi t0, t0, 16
  vle16.v v5, (t0)
  addi t0, t0, 16
  vle16.v v6, (t0)
  addi t0, t0, 16
  vle16.v v7, (t0)

  jalr t0, a5

  vssra.vi v0, v0, 4
  vssra.vi v1, v1, 4
  vssra.vi v2, v2, 4
  vssra.vi v3, v3, 4
  vssra.vi v4, v4, 4
  vssra.vi v5, v5, 4
  vssra.vi v6, v6, 4
  vssra.vi v7, v7, 4

  li t1, 64
  vsetvli zero, t1, e16, m8, ta, ma
  vmv.v.x v8, zero
  vse16.v v8, (a2)

itx_8x8_end:
  vsetivli zero, 8, e8, mf2, ta, ma
  vle8.v v8, (a0)
  add t0, a0, a1
  vle8.v v9, (t0)
  add t0, t0, a1
  vle8.v v10, (t0)
  add t0, t0, a1
  vle8.v v11, (t0)
  add t0, t0, a1
  vle8.v v12, (t0)
  add t0, t0, a1
  vle8.v v13, (t0)
  add t0, t0, a1
  vle8.v v14, (t0)
  add t0, t0, a1
  vle8.v v15, (t0)

  vwaddu.wv v0, v0, v8
  vwaddu.wv v1, v1, v9
  vwaddu.wv v2, v2, v10
  vwaddu.wv v3, v3, v11
  vwaddu.wv v4, v4, v12
  vwaddu.wv v5, v5, v13
  vwaddu.wv v6, v6, v14
  vwaddu.wv v7, v7, v15

  vsetvli zero, zero, e16, m1, ta, ma
  vmax.vx v0, v0, zero
  vmax.vx v1, v1, zero
  vmax.vx v2, v2, zero
  vmax.vx v3, v3, zero
  vmax.vx v4, v4, zero
  vmax.vx v5, v5, zero
  vmax.vx v6, v6, zero
  vmax.vx v7, v7, zero

  vsetvli zero, zero, e8, mf2, ta, ma

  vnclipu.wi v8, v0, 0
  vnclipu.wi v9, v1, 0
  vnclipu.wi v10, v2, 0
  vnclipu.wi v11, v3, 0
  vnclipu.wi v12, v4, 0
  vnclipu.wi v13, v5, 0
  vnclipu.wi v14, v6, 0
  vnclipu.wi v15, v7, 0

  vse8.v v8, (a0)
  add a0, a0, a1
  vse8.v v9, (a0)
  add a0, a0, a1
  vse8.v v10, (a0)
  add a0, a0, a1
  vse8.v v11, (a0)
  add a0, a0, a1
  vse8.v v12, (a0)
  add a0, a0, a1
  vse8.v v13, (a0)
  add a0, a0, a1
  vse8.v v14, (a0)
  add a0, a0, a1
  vse8.v v15, (a0)

  ret
.endif
endfunc
.endm

def_fn_8x8_base identity_
def_fn_8x8_base

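// 8-point identity transform simply doubles each coefficient (saturating).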
function inv_identity_e16_x8_rvv, export=1, ext=v
  vsadd.vv v0, v0, v0
  vsadd.vv v1, v1, v1
  vsadd.vv v2, v2, v2
  vsadd.vv v3, v3, v3
  vsadd.vv v4, v4, v4
  vsadd.vv v5, v5, v5
  vsadd.vv v6, v6, v6
  vsadd.vv v7, v7, v7

  jr t0
endfunc

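// 8-point inverse DCT: idct_4 handles the even inputs, the odd inputs are
// rotated with 799/4017 and 3406/2276 (~4096*(sin,cos) of pi/16 and 3*pi/16),
// and the halves are combined with a final 2896 (1/sqrt(2)) butterfly.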
.macro idct_8 o0, o1, o2, o3, o4, o5, o6, o7
  idct_4 \o0, \o2, \o4, \o6

  li t1, 799
  li t2, 4017
  li t3, 3406
  li t4, 2276

  vwmul.vx v22, \o1, t2
  neg t2, t2
  vwmul.vx v16, \o1, t1
  vwmacc.vx v22, t1, \o7
  vwmacc.vx v16, t2, \o7

  vwmul.vx v20, \o5, t4
  neg t4, t4
  vwmul.vx v18, \o5, t3
  vwmacc.vx v20, t3, \o3
  vwmacc.vx v18, t4, \o3

  vnclip.wi v16, v16, 12
  vnclip.wi v18, v18, 12
  vnclip.wi v20, v20, 12
  vnclip.wi v22, v22, 12

  vssub.vv \o7, v22, v20
  vsadd.vv v22, v22, v20
  vssub.vv \o1, v16, v18
  vsadd.vv v16, v16, v18

  li t2, 2896

  vwmul.vx v18, \o7, t2
  vwmul.vx v20, \o7, t2
  vwmacc.vx v20, t2, \o1
  neg t2, t2
  vwmacc.vx v18, t2, \o1

  vnclip.wi v18, v18, 12
  vnclip.wi v20, v20, 12

  vssub.vv \o7, \o0, v22
  vsadd.vv \o0, \o0, v22
  vssub.vv v17, \o2, v20
  vsadd.vv \o1, \o2, v20
  vssub.vv \o5, \o4, v18
  vsadd.vv \o2, \o4, v18
  vssub.vv \o4, \o6, v16
  vsadd.vv \o3, \o6, v16
  vmv.v.v \o6, v17
.endm

.macro iadst_8 o0, o1, o2, o3, o4, o5, o6, o7
  li t1, 4076
  li t2, 401
  li t3, 3612
  li t4, 1931
  li t5, 2598
  li t6, 3166

  vwmul.vx v16, v7, t1
  neg t1, t1
  vwmul.vx v18, v7, t2
  vwmacc.vx v16, t2, v0
  vwmacc.vx v18, t1, v0

  vwmul.vx v20, v5, t3
  neg t3, t3
  vwmul.vx v22, v5, t4
  vwmacc.vx v20, t4, v2
  vwmacc.vx v22, t3, v2

  vwmul.vx v24, v3, t5
  neg t5, t5
  vwmul.vx v26, v3, t6
  vwmacc.vx v24, t6, v4
  vwmacc.vx v26, t5, v4

  li t2, 1189
  li t3, 3920
  li t4, 1567
  li t5, 3784
  li t6, 2896

  vwmul.vx v28, v1, t2
  neg t2, t2
  vwmul.vx v30, v1, t3
  vwmacc.vx v28, t3, v6
  vwmacc.vx v30, t2, v6

  vnclip.wi v16, v16, 12
  vnclip.wi v18, v18, 12
  vnclip.wi v20, v20, 12
  vnclip.wi v22, v22, 12
  vnclip.wi v24, v24, 12
  vnclip.wi v26, v26, 12
  vnclip.wi v28, v28, 12
  vnclip.wi v30, v30, 12

  vssub.vv  v4, v16, v24
  vsadd.vv v16, v16, v24
  vsadd.vv  v1, v18, v26
  vsadd.vv  v2, v20, v28
  vsadd.vv  v3, v22, v30
  vssub.vv  v5, v18, v26
  vssub.vv  v6, v20, v28
  vssub.vv v30, v22, v30

  vsadd.vv \o0, v16, v2
  vsadd.vv \o7,  v1, v3
  vssub.vv  v2, v16, v2
  vssub.vv  v3,  v1, v3

  vwmul.vx v16,  v4, t5
  vwmul.vx v18,  v4, t4
  vwmul.vx v20, v30, t5
  vwmul.vx v22, v30, t4
  vwmacc.vx v16, t4, v5
  neg t4, t4
  vwmacc.vx v22, t5, v6
  neg t5, t5
  vwmacc.vx v20, t4, v6
  vwmacc.vx v18, t5, v5

  vnclip.wi v16, v16, 12
  vnclip.wi v18, v18, 12
  vnclip.wi v20, v20, 12
  vnclip.wi v22, v22, 12

  vsadd.vv \o1, v16, v20
  vsadd.vv \o6, v18, v22
  vssub.vv v16, v16, v20
  vssub.vv v17, v18, v22

  vwmul.vx v18, v2, t6
  vwmul.vx v20, v2, t6
  vwmul.vx v22, v16, t6
  vwmul.vx v24, v16, t6
  vwmacc.vx v18, t6, v3
  vwmacc.vx v22, t6, v17
  neg t6, t6
  vwmacc.vx v20, t6, v3
  vwmacc.vx v24, t6, v17

  vnclip.wi \o3, v18, 12
  vnclip.wi \o4, v20, 12
  vnclip.wi \o2, v22, 12
  vnclip.wi \o5, v24, 12

  vmv.v.x v16, zero
  vssub.vv \o1, v16, \o1
  vssub.vv \o3, v16, \o3
  vssub.vv \o5, v16, \o5
  vssub.vv \o7, v16, \o7
.endm

function inv_dct_e16_x8_rvv, export=1, ext=v
  idct_8 v0, v1, v2, v3, v4, v5, v6, v7
  jr t0
endfunc

function inv_adst_e16_x8_rvv, export=1, ext=v
  iadst_8 v0, v1, v2, v3, v4, v5, v6, v7
  jr t0
endfunc

function inv_flipadst_e16_x8_rvv, export=1, ext=v
  iadst_8 v7, v6, v5, v4, v3, v2, v1, v0
  jr t0
endfunc

.macro def_fn_8x8 txfm1, txfm2
function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_8bpc_rvv, export=1, ext=v
.ifc \txfm1\()_\txfm2, dct_dct
  beqz a3, 1f
.endif
  la a5, inv_\txfm2\()_e16_x8_rvv
.ifc \txfm1, identity
  j inv_txfm_identity_add_8x8_rvv
.else
  la a4, inv_\txfm1\()_e16_x8_rvv
  j inv_txfm_add_8x8_rvv
.endif
.ifc \txfm1\()_\txfm2, dct_dct
1:
  csrw vxrm, zero
  vsetivli zero, 8, e16, m1, ta, ma
  ld t2, (a2)
  li t1, 2896*8
  vmv.v.x v0, t2
  vsmul.vx v0, v0, t1
  sd x0, (a2)
  vssra.vi v0, v0, 1
  vsmul.vx v0, v0, t1
  vssra.vi v0, v0, 4
  vmv.v.v v1, v0
  vmv.v.v v2, v0
  vmv.v.v v3, v0
  vmv.v.v v4, v0
  vmv.v.v v5, v0
  vmv.v.v v6, v0
  vmv.v.v v7, v0
  j itx_8x8_end
.endif
endfunc
.endm

def_fn_8x8 dct, dct
def_fn_8x8 identity, identity
def_fn_8x8 dct, adst
def_fn_8x8 dct, flipadst
def_fn_8x8 dct, identity
def_fn_8x8 adst, dct
def_fn_8x8 adst, adst
def_fn_8x8 adst, flipadst
def_fn_8x8 flipadst, dct
def_fn_8x8 flipadst, adst
def_fn_8x8 flipadst, flipadst
def_fn_8x8 identity, dct
def_fn_8x8 adst, identity
def_fn_8x8 flipadst, identity
def_fn_8x8 identity, adst
def_fn_8x8 identity, flipadst

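// Rectangular 4x8/8x4 transforms: the coefficients are pre-scaled by
// 2896*8/32768 ~ 1/sqrt(2) (vsmul is Q15), matching the extra 1/sqrt(2)
// scale AV1 applies to transforms whose width and height differ by 2x.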
function inv_txfm_add_4x8_rvv, export=1, ext=v
  csrw vxrm, zero

  vsetivli zero, 8, e16, m1, ta, ma
  vle16.v v0, (a2)
  addi t0, a2, 16
  vle16.v v1, (t0)
  addi t0, t0, 16
  vle16.v v2, (t0)
  addi t0, t0, 16
  vle16.v v3, (t0)

  li t1, 2896*8
.irp i, 0, 1, 2, 3
  vsmul.vx v\i, v\i, t1
.endr

  jalr t0, a4

  vsseg4e16.v v0, (a2)

  vsetivli zero, 4, e16, mf2, ta, ma
  vmv.v.x v8, zero
  vle16.v v0, (a2)
  vse16.v v8, (a2)
.irp i, 1, 2, 3, 4, 5, 6, 7
  addi a2, a2, 8
  vle16.v v\i, (a2)
  vse16.v v8, (a2)
.endr

  jalr t0, a5

.irp i, 0, 1, 2, 3, 4, 5, 6, 7
  vssra.vi v\i, v\i, 4
.endr

  vsetvli zero, zero, e8, mf4, ta, ma
  vle8.v v8, (a0)
  add t0, a0, a1
  vle8.v v9, (t0)
.irp i, 10, 11, 12, 13, 14, 15
  add t0, t0, a1
  vle8.v v\i, (t0)
.endr

  vwaddu.wv v0, v0,  v8
  vwaddu.wv v1, v1,  v9
  vwaddu.wv v2, v2, v10
  vwaddu.wv v3, v3, v11
  vwaddu.wv v4, v4, v12
  vwaddu.wv v5, v5, v13
  vwaddu.wv v6, v6, v14
  vwaddu.wv v7, v7, v15

  vsetvli zero, zero, e16, mf2, ta, ma
.irp i, 0, 1, 2, 3, 4, 5, 6, 7
  vmax.vx v\i, v\i, zero
.endr

  vsetvli zero, zero, e8, mf4, ta, ma

  vnclipu.wi  v8, v0, 0
  vnclipu.wi  v9, v1, 0
  vnclipu.wi v10, v2, 0
  vnclipu.wi v11, v3, 0
  vnclipu.wi v12, v4, 0
  vnclipu.wi v13, v5, 0
  vnclipu.wi v14, v6, 0
  vnclipu.wi v15, v7, 0

  vse8.v v8, (a0)
.irp i, 9, 10, 11, 12, 13, 14, 15
  add a0, a0, a1
  vse8.v v\i, (a0)
.endr

  ret
endfunc

function inv_txfm_add_8x4_rvv, export=1, ext=v
  csrw vxrm, zero

  vsetivli zero, 4, e16, mf2, ta, ma
  vle16.v v0, (a2)
  addi t0, a2, 8
  vle16.v v1, (t0)
.irp i, 2, 3, 4, 5, 6, 7
  addi t0, t0, 8
  vle16.v v\i, (t0)
.endr

  li t1, 2896*8
.irp i, 0, 1, 2, 3, 4, 5, 6, 7
  vsmul.vx v\i, v\i, t1
.endr

  jalr t0, a4

  vsseg8e16.v v0, (a2)

  vsetivli zero, 8, e16, m1, ta, ma
  vmv.v.x v4, zero
  vle16.v v0, (a2)
  vse16.v v4, (a2)
.irp i, 1, 2, 3
  addi a2, a2, 16
  vle16.v v\i, (a2)
  vse16.v v4, (a2)
.endr

  jalr t0, a5

  vssra.vi v0, v0, 4
  vssra.vi v1, v1, 4
  vssra.vi v2, v2, 4
  vssra.vi v3, v3, 4

  vsetvli zero, zero, e8, mf2, ta, ma
  vle8.v v4, (a0)
  add t0, a0, a1
  vle8.v v5, (t0)
  add t0, t0, a1
  vle8.v v6, (t0)
  add t0, t0, a1
  vle8.v v7, (t0)

  vwaddu.wv v0, v0, v4
  vwaddu.wv v1, v1, v5
  vwaddu.wv v2, v2, v6
  vwaddu.wv v3, v3, v7

  vsetvli zero, zero, e16, m1, ta, ma
  vmax.vx v0, v0, zero
  vmax.vx v1, v1, zero
  vmax.vx v2, v2, zero
  vmax.vx v3, v3, zero

  vsetvli zero, zero, e8, mf2, ta, ma

  vnclipu.wi v4, v0, 0
  vnclipu.wi v5, v1, 0
  vnclipu.wi v6, v2, 0
  vnclipu.wi v7, v3, 0

  vse8.v v4, (a0)
  add a0, a0, a1
  vse8.v v5, (a0)
  add a0, a0, a1
  vse8.v v6, (a0)
  add a0, a0, a1
  vse8.v v7, (a0)

  ret
endfunc

/* Define symbols so the txfm names can be compared in the .if statements below */
.equ dct, 1
.equ identity, 2
.equ adst, 3
.equ flipadst, 4

.macro def_fn_48 w, h, txfm1, txfm2
function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_rvv, export=1
.if \w == 4 && (\txfm1 == adst || \txfm1 == flipadst)
  la a4, inv_\txfm1\()_e16_x\w\()w_rvv
.else
  la a4, inv_\txfm1\()_e16_x\w\()_rvv
.endif
.if \h == 4 && (\txfm2 == adst || \txfm2 == flipadst)
  la a5, inv_\txfm2\()_e16_x\h\()w_rvv
.else
  la a5, inv_\txfm2\()_e16_x\h\()_rvv
.endif
  j inv_txfm_add_\w\()x\h\()_rvv
endfunc
.endm

.macro def_fns_48 w, h
def_fn_48 \w, \h, dct, dct
def_fn_48 \w, \h, identity, identity
def_fn_48 \w, \h, dct, adst
def_fn_48 \w, \h, dct, flipadst
def_fn_48 \w, \h, dct, identity
def_fn_48 \w, \h, adst, dct
def_fn_48 \w, \h, adst, adst
def_fn_48 \w, \h, adst, flipadst
def_fn_48 \w, \h, flipadst, dct
def_fn_48 \w, \h, flipadst, adst
def_fn_48 \w, \h, flipadst, flipadst
def_fn_48 \w, \h, identity, dct
def_fn_48 \w, \h, adst, identity
def_fn_48 \w, \h, flipadst, identity
def_fn_48 \w, \h, identity, adst
def_fn_48 \w, \h, identity, flipadst
.endm

def_fns_48 4, 8
def_fns_48 8, 4

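// 16-point identity transform: scales by 2*sqrt(2). The vsadd doubles the
// input and 2*(5793-4096)*8 adds the remaining 2*(sqrt(2)-1) part in Q15.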
function inv_identity_e16_x16_rvv, export=1, ext=v
  li t1, 2*(5793-4096)*8
.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
  vsmul.vx v16, v\i, t1
  vsadd.vv v\i, v\i, v\i
  vsadd.vv v\i, v\i, v16
.endr
  jr t0
endfunc

function inv_dct_e16_x16_rvv, export=1, ext=v
  idct_8 v0, v2, v4, v6, v8, v10, v12, v14

  li t1, 401
  li t2, 4076
  li t3, 3166
  li t4, 2598

  vwmul.vx v30, v1, t2
  neg t2, t2
  vwmul.vx v16, v1, t1
  vwmacc.vx v30, t1, v15
  vwmacc.vx v16, t2, v15

  vwmul.vx v28, v9, t4
  neg t4, t4
  vwmul.vx v18, v9, t3
  vwmacc.vx v28, t3, v7
  vwmacc.vx v18, t4, v7

  li t1, 1931
  li t2, 3612
  li t3, 3920
  li t4, 1189

  vwmul.vx v26, v5, t2
  neg t2, t2
  vwmul.vx v20, v5, t1
  vwmacc.vx v26, t1, v11
  vwmacc.vx v20, t2, v11

  vwmul.vx v24, v13, t4
  neg t4, t4
  vwmul.vx v22, v13, t3
  vwmacc.vx v24, t3, v3
  vwmacc.vx v22, t4, v3

  li t2, 2896
  li t3, 1567
  li t4, 3784

  vnclip.wi v16, v16, 12
  vnclip.wi v18, v18, 12
  vnclip.wi v20, v20, 12
  vnclip.wi v22, v22, 12
  vnclip.wi v24, v24, 12
  vnclip.wi v26, v26, 12
  vnclip.wi v28, v28, 12
  vnclip.wi v30, v30, 12

  vssub.vv  v3, v16, v18
  vsadd.vv v16, v16, v18
  vssub.vv  v5, v22, v20
  vsadd.vv v22, v22, v20
  vssub.vv v11, v24, v26
  vsadd.vv v24, v24, v26
  vssub.vv v13, v30, v28
  vsadd.vv v30, v30, v28

  vwmul.vx v28, v13, t4
  neg t4, t4
  vwmul.vx v18, v13, t3
  vwmul.vx v26, v11, t3
  vwmacc.vx v28, t3, v3
  neg t3, t3
  vwmul.vx v20, v11, t4
  vwmacc.vx v18, t4, v3
  vwmacc.vx v20, t3, v5
  vwmacc.vx v26, t4, v5

  vnclip.wi v18, v18, 12
  vnclip.wi v20, v20, 12
  vnclip.wi v26, v26, 12
  vnclip.wi v28, v28, 12

  vssub.vv  v5, v18, v20
  vsadd.vv v18, v18, v20
  vssub.vv v11, v28, v26
  vsadd.vv v28, v28, v26

  vssub.vv  v7, v16, v22
  vsadd.vv v16, v16, v22
  vssub.vv  v9, v30, v24
  vsadd.vv v30, v30, v24

  vwmul.vx v20, v11, t2
  vwmul.vx v22,  v9, t2
  vwmul.vx v24,  v9, t2
  vwmul.vx v26, v11, t2
  vwmacc.vx v24, t2, v7
  vwmacc.vx v26, t2, v5
  neg t2, t2
  vwmacc.vx v20, t2, v5
  vwmacc.vx v22, t2, v7

  vnclip.wi v20, v20, 12
  vnclip.wi v22, v22, 12
  vnclip.wi v24, v24, 12
  vnclip.wi v26, v26, 12

  vssub.vv v15,  v0, v30
  vsadd.vv  v0,  v0, v30
  vssub.vv v17,  v2, v28
  vsadd.vv  v1,  v2, v28
  vssub.vv v13,  v4, v26
  vsadd.vv  v2,  v4, v26
  vssub.vv v19,  v6, v24
  vsadd.vv  v3,  v6, v24
  vssub.vv v11,  v8, v22
  vsadd.vv  v4,  v8, v22
  vsadd.vv  v5, v10, v20
  vssub.vv v10, v10, v20
  vssub.vv  v9, v12, v18
  vsadd.vv  v6, v12, v18
  vssub.vv  v8, v14, v16
  vsadd.vv  v7, v14, v16
  vmv.v.v v14, v17
  vmv.v.v v12, v19

  jr t0
endfunc

.macro iadst_16 o0, o1, o2, o3, o4, o5, o6, o7, o8, o9, o10, o11, o12, o13, o14, o15
  li t1, 4091
  li t2, 201
  li t3, 3973
  li t4, 995

  vwmul.vx v16, v15, t1
  neg t1, t1
  vwmul.vx v18, v15, t2
  vwmacc.vx v16, t2, v0
  vwmacc.vx v18, t1, v0

  vwmul.vx v20, v13, t3
  neg t3, t3
  vwmul.vx v22, v13, t4
  vwmacc.vx v20, t4, v2
  vwmacc.vx v22, t3, v2

  li t1, 3703
  li t2, 1751
  li t3, 3290
  li t4, 2440

  vwmul.vx v24, v11, t1
  neg t1, t1
  vwmul.vx v26, v11, t2
  vwmacc.vx v24, t2, v4
  vwmacc.vx v26, t1, v4

  vwmul.vx v28, v9, t3
  neg t3, t3
  vwmul.vx v30, v9, t4
  vwmacc.vx v28, t4, v6
  vwmacc.vx v30, t3, v6

  vnclip.wi  v0, v16, 12
  vnclip.wi v18, v18, 12
  vnclip.wi  v2, v20, 12
  vnclip.wi v22, v22, 12
  vnclip.wi  v4, v24, 12
  vnclip.wi v26, v26, 12
  vnclip.wi  v6, v28, 12
  vnclip.wi v30, v30, 12

  li t1, 2751
  li t2, 3035
  li t3, 2106
  li t4, 3513

  vwmul.vx v16, v7, t1
  neg t1, t1
  vwmul.vx v20, v7, t2
  vwmacc.vx v16, t2, v8
  vwmacc.vx v20, t1, v8

  vwmul.vx v24, v5, t3
  neg t3, t3
  vwmul.vx v28, v5, t4
  vwmacc.vx v24, t4, v10
  vwmacc.vx v28, t3, v10

  vnclip.wi v16, v16, 12
  vnclip.wi  v9, v20, 12
  vnclip.wi v24, v24, 12
  vnclip.wi v11, v28, 12

  vssub.vv  v8,  v0, v16
  vsadd.vv  v0,  v0, v16
  vssub.vv v10,  v2, v24
  vsadd.vv  v2,  v2, v24

  li t1, 1380
  li t2, 3857
  li t3, 601
  li t4, 4052

  vwmul.vx v16, v3, t1
  neg t1, t1
  vwmul.vx v20, v3, t2
  vwmacc.vx v16, t2, v12
  vwmacc.vx v20, t1, v12

  vwmul.vx v24, v1, t3
  neg t3, t3
  vwmul.vx v28, v1, t4
  vwmacc.vx v24, t4, v14
  vwmacc.vx v28, t3, v14

  vnclip.wi v16, v16, 12
  vnclip.wi v13, v20, 12
  vnclip.wi v24, v24, 12
  vnclip.wi v15, v28, 12

  vssub.vv v12,  v4, v16
  vsadd.vv v16,  v4, v16
  vssub.vv v14,  v6, v24
  vsadd.vv v20,  v6, v24

  vsadd.vv  v1, v18,  v9
  vssub.vv  v9, v18,  v9
  vsadd.vv  v3, v22, v11
  vssub.vv v11, v22, v11
  vsadd.vv v18, v26, v13
  vssub.vv v13, v26, v13
  vsadd.vv v22, v30, v15
  vssub.vv v15, v30, v15

  vssub.vv v4, v0, v16
  vsadd.vv v0, v0, v16
  vssub.vv v5, v1, v18
  vsadd.vv v1, v1, v18
  vssub.vv v6, v2, v20
  vsadd.vv v2, v2, v20
  vssub.vv v7, v3, v22
  vsadd.vv v3, v3, v22

  li t1, 799
  li t2, 4017
  li t3, 3406
  li t4, 2276

  vwmul.vx v16,  v8, t2
  vwmul.vx v18,  v8, t1
  vwmul.vx v20, v10, t4
  vwmul.vx v22, v10, t3
  vwmul.vx v24, v13, t2
  vwmul.vx v26, v13, t1
  vwmul.vx v28, v15, t4
  vwmul.vx v30, v15, t3
  vwmacc.vx v16, t1,  v9
  neg t1, t1
  vwmacc.vx v20, t3, v11
  neg t3, t3
  vwmacc.vx v26, t2, v12
  neg t2, t2
  vwmacc.vx v30, t4, v14
  neg t4, t4
  vwmacc.vx v18, t2,  v9
  vwmacc.vx v22, t4, v11
  vwmacc.vx v24, t1, v12
  vwmacc.vx v28, t3, v14

  li t2, 2896
  li t3, 1567
  li t4, 3784

  vnclip.wi v16, v16, 12
  vnclip.wi v18, v18, 12
  vnclip.wi v20, v20, 12
  vnclip.wi v22, v22, 12
  vnclip.wi v24, v24, 12
  vnclip.wi v26, v26, 12
  vnclip.wi v28, v28, 12
  vnclip.wi v30, v30, 12

  vsadd.vv  v8, v16, v24
  vsadd.vv  v9, v18, v26
  vsadd.vv v10, v20, v28
  vsadd.vv v11, v22, v30
  vssub.vv v12, v16, v24
  vssub.vv v13, v18, v26
  vssub.vv v14, v20, v28
  vssub.vv v15, v22, v30

  vwmul.vx v16,  v4, t4
  vwmul.vx v18,  v4, t3
  vwmul.vx v20,  v7, t4
  vwmul.vx v22,  v7, t3
  vwmul.vx v24, v12, t4
  vwmul.vx v26, v12, t3
  vwmul.vx v28, v15, t4
  vwmul.vx v30, v15, t3
  vwmacc.vx v16, t3,  v5
  vwmacc.vx v22, t4,  v6
  vwmacc.vx v24, t3, v13
  neg t3, t3
  vwmacc.vx v30, t4, v14
  neg t4, t4
  vwmacc.vx v20, t3,  v6
  vwmacc.vx v28, t3, v14
  vwmacc.vx v18, t4,  v5
  vwmacc.vx v26, t4, v13

  vnclip.wi v16, v16, 12
  vnclip.wi v18, v18, 12
  vnclip.wi v20, v20, 12
  vnclip.wi v22, v22, 12
  vnclip.wi v24, v24, 12
  vnclip.wi v26, v26, 12
  vnclip.wi v28, v28, 12
  vnclip.wi v30, v30, 12

.ifc \o0, v0
  vsadd.vv \o14, v9, v11
  vssub.vv  v11, v9, v11
  vssub.vv   v9, v1,  v3
  vsadd.vv \o15, v1,  v3
  vsadd.vv  \o1, v8, v10
  vssub.vv  v10, v8, v10
  vssub.vv   v8, v0,  v2
  vsadd.vv  \o0, v0,  v2
.else
  vsadd.vv  \o1, v8, v10
  vssub.vv  v10, v8, v10
  vssub.vv   v8, v0,  v2
  vsadd.vv  \o0, v0,  v2
  vsadd.vv   v2, v9, v11
  vssub.vv  v11, v9, v11
  vssub.vv   v9, v1,  v3
  vsadd.vv \o15, v1,  v3
  vmv.v.v  \o14, v2
.endif

  vsadd.vv  \o3, v16, v20
  vssub.vv   v6, v16, v20
  vsadd.vv \o12, v18, v22
  vssub.vv   v7, v18, v22
  vsadd.vv  \o2, v24, v28
  vssub.vv  v24, v24, v28
  vsadd.vv \o13, v26, v30
  vssub.vv  v26, v26, v30

  neg t3, t2

  vwmul.vx v28, v24, t2
  vwmul.vx v30, v24, t2
  vwmacc.vx v28, t2, v26
  vwmacc.vx v30, t3, v26

  vwmul.vx v24, v10, t2
  vwmul.vx v26, v10, t2
  vwmacc.vx v24, t2, v11
  vwmacc.vx v26, t3, v11

  vwmul.vx v20, v6, t2
  vwmul.vx v22, v6, t2
  vwmacc.vx v20, t2, v7
  vwmacc.vx v22, t3, v7

  vwmul.vx v16, v8, t2
  vwmul.vx v18, v8, t2
  vwmacc.vx v16, t2, v9
  vwmacc.vx v18, t3, v9

  vnclip.wi  \o7, v16, 12
  vnclip.wi  \o8, v18, 12
  vnclip.wi  \o4, v20, 12
  vnclip.wi \o11, v22, 12
  vnclip.wi  \o6, v24, 12
  vnclip.wi  \o9, v26, 12
  vnclip.wi  \o5, v28, 12
  vnclip.wi \o10, v30, 12

  vmv.v.x v16, zero
  vssub.vv  \o1, v16,  \o1
  vssub.vv  \o3, v16,  \o3
  vssub.vv  \o5, v16,  \o5
  vssub.vv  \o7, v16,  \o7
  vssub.vv  \o9, v16,  \o9
  vssub.vv \o11, v16, \o11
  vssub.vv \o13, v16, \o13
  vssub.vv \o15, v16, \o15
.endm

function inv_adst_e16_x16_rvv, export=1, ext=v
  iadst_16 v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15
  jr t0
endfunc

function inv_flipadst_e16_x16_rvv, export=1, ext=v
  iadst_16 v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0
  jr t0
endfunc

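// Horizontal 16x8 pass: loads 16 rows of 8 coefficients from (t4) with row
// stride t6, zeroing the buffer as it goes, applies the transform in a4 (or
// the inline identity), and stores the result transposed to (t5) with
// strided vsse16 stores. Returns via a7.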
.macro def_horz_16 variant
function inv_txfm_horz\variant\()_16x8_rvv, export=1, ext=v
  vmv.v.x v16, zero
  vle16.v v0, (t4)
  vse16.v v16, (t4)
.irp i, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
  add t4, t4, t6
  vle16.v v\i, (t4)
  vse16.v v16, (t4)
.endr
.ifc \variant, _identity
  li t1, 2*(5793-4096)*8
.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
  vsmul.vx v16, v\i, t1
  vsra.vi v16, v16, 1
  vaadd.vv v\i, v\i, v16
.endr
  j L(horz_16x8_epilog)
.else
  jalr t0, a4
.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
  vssra.vi v\i, v\i, 2
.endr
L(horz_16x8_epilog):
  vsse16.v v0, (t5), t6
.irp i, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
  addi t5, t5, 2
  vsse16.v v\i, (t5), t6
.endr
  jr a7
.endif
endfunc
.endm

def_horz_16 _identity
def_horz_16

function inv_txfm_add_vert_8x16_rvv, export=1, ext=v
  vsetivli zero, 8, e16, m1, ta, ma

  vle16.v v0, (t4)
.irp i, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
  add t4, t4, t6
  vle16.v v\i, (t4)
.endr

  jalr t0, a5

.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
  vssra.vi v\i, v\i, 4
.endr

  vsetivli zero, 8, e8, mf2, ta, ma

  vle8.v v16, (t5)
  add t0, t5, a1
  vle8.v v17, (t0)
.irp i, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
  add t0, t0, a1
  vle8.v v\i, (t0)
.endr

  vwaddu.wv v0, v0, v16
  vwaddu.wv v1, v1, v17
  vwaddu.wv v2, v2, v18
  vwaddu.wv v3, v3, v19
  vwaddu.wv v4, v4, v20
  vwaddu.wv v5, v5, v21
  vwaddu.wv v6, v6, v22
  vwaddu.wv v7, v7, v23
  vwaddu.wv v8, v8, v24
  vwaddu.wv v9, v9, v25
  vwaddu.wv v10, v10, v26
  vwaddu.wv v11, v11, v27
  vwaddu.wv v12, v12, v28
  vwaddu.wv v13, v13, v29
  vwaddu.wv v14, v14, v30
  vwaddu.wv v15, v15, v31

  vsetvli zero, zero, e16, m1, ta, ma
.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
  vmax.vx v\i, v\i, zero
.endr

  vsetvli zero, zero, e8, mf2, ta, ma
  vnclipu.wi v16, v0, 0
  vnclipu.wi v17, v1, 0
  vnclipu.wi v18, v2, 0
  vnclipu.wi v19, v3, 0
  vnclipu.wi v20, v4, 0
  vnclipu.wi v21, v5, 0
  vnclipu.wi v22, v6, 0
  vnclipu.wi v23, v7, 0
  vnclipu.wi v24, v8, 0
  vnclipu.wi v25, v9, 0
  vnclipu.wi v26, v10, 0
  vnclipu.wi v27, v11, 0
  vnclipu.wi v28, v12, 0
  vnclipu.wi v29, v13, 0
  vnclipu.wi v30, v14, 0
  vnclipu.wi v31, v15, 0

  vse8.v v16, (t5)
.irp i, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
  add t5, t5, a1
  vse8.v v\i, (t5)
.endr

  jr a7
endfunc

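// 16x16 driver: runs the horizontal pass (a6) into a 16x16 scratch buffer on
// the stack, one 16x8 half at a time, then the vertical pass plus add via
// inv_txfm_add_vert_8x16_rvv. When the eob in a3 is below the threshold in
// a7, the second horizontal half is skipped and its scratch rows are zeroed.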
function inv_txfm_add_16x16_rvv, export=1, ext=v
  csrw vxrm, zero
  vsetivli zero, 8, e16, m1, ta, ma
  addi sp, sp, -16*32
.irp i, 8, 0
  addi t4, a2, \i*2
  addi t5, sp, \i*16*2
.if \i == 8
  blt a3, a7, 1f
.endif
  li t6, 16*2
  jalr a7, a6
.if \i == 8
  j 2f
1:
  li t1, 64
  vsetvli zero, t1, e16, m8, ta, ma
  vmv.v.x v0, zero
  vse16.v v0, (t5)
  addi t5, t5, 128
  vse16.v v0, (t5)
  vsetivli zero, 8, e16, m1, ta, ma
2:
.endif
.endr
.irp i, 0, 8
  addi t4, sp, \i*2
  addi t5, a0, \i
  li t6, 16*2
  jal a7, inv_txfm_add_vert_8x16_rvv
.endr
  addi sp, sp, 16*32
  ret
endfunc

.macro def_fn_16x16 txfm1, txfm2, eob_half
function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_8bpc_rvv, export=1, ext=v
.ifc \txfm1\()_\txfm2, dct_dct
  beqz a3, 1f
.endif
.ifc \txfm1, identity
  la a6, inv_txfm_horz_identity_16x8_rvv
.else
  la a6, inv_txfm_horz_16x8_rvv
  la a4, inv_\txfm1\()_e16_x16_rvv
.endif
  la a5, inv_\txfm2\()_e16_x16_rvv
  li a7, \eob_half
  j inv_txfm_add_16x16_rvv
.ifc \txfm1\()_\txfm2, dct_dct
1:
  csrw vxrm, zero
  vsetivli zero, 16, e16, m2, ta, ma
  lh t2, (a2)
  li t3, 2896*8
  li t4, 1<<14
  li t5, 0xFFFF
  li t6, -0x10000

  sh x0, (a2)

  mul t2, t2, t3
  add t2, t2, t4
  srai t2, t2, 15
  ble t2, t5, 3f
  mv t2, t5
3:
  ble t6, t2, 4f
  mv t2, t6
4:
  addi t2, t2, 2
  srai t2, t2, 2
  mul t2, t2, t3
  add t2, t2, t4
  srai t2, t2, 15
  ble t2, t5, 5f
  mv t2, t5
5:
  ble t6, t2, 6f
  mv t2, t6
6:
  addi t2, t2, 8
  srai t2, t2, 4
  vmv.v.x v24, t2

  vsetvli zero, zero, e8, m1, ta, ma
  add t2, a1, a1
  li t3, 16
2:
  add t0, a0, a1
  vle8.v v16, (a0)
  vle8.v v17, (t0)

  vwaddu.wv v0, v24, v16
  vwaddu.wv v2, v24, v17

  addi t3, t3, -2 // loop counter

  vsetvli zero, zero, e16, m2, ta, ma
.irp i, 0, 2
  vmax.vx v\i, v\i, zero
.endr

  vsetvli zero, zero, e8, m1, ta, ma

  vnclipu.wi  v16, v0, 0
  vnclipu.wi  v17, v2, 0

  add t0, a0, a1
  vse8.v v16, (a0)
  add a0, a0, t2
  vse8.v v17, (t0)

  bnez t3, 2b

  ret
.endif
endfunc
.endm

def_fn_16x16 dct, dct, 36
def_fn_16x16 identity, identity, 36
def_fn_16x16 dct, adst, 36
def_fn_16x16 dct, flipadst, 36
def_fn_16x16 dct, identity, 8
def_fn_16x16 adst, dct, 36
def_fn_16x16 adst, adst, 36
def_fn_16x16 adst, flipadst, 36
def_fn_16x16 flipadst, dct, 36
def_fn_16x16 flipadst, adst, 36
def_fn_16x16 flipadst, flipadst, 36
def_fn_16x16 identity, dct, 8

.macro def_fn_416_base variant
function inv_txfm_\variant\()add_4x16_rvv, export=1, ext=v
  csrw vxrm, zero

  vsetivli zero, 8, e16, m1, ta, ma

  blt a3, a6, 1f

  addi t0, a2, 16
  vle16.v v0, (t0)
  addi t0, t0, 32
  vle16.v v1, (t0)
  addi t0, t0, 32
  vle16.v v2, (t0)
  addi t0, t0, 32
  vle16.v v3, (t0)

.ifc \variant, identity_
  li t1, (5793-4096)*8
  vsmul.vx v8, v0, t1
  vaadd.vv v4, v0, v8
  vsmul.vx v8, v1, t1
  vaadd.vv v5, v1, v8
  vsmul.vx v8, v2, t1
  vaadd.vv v6, v2, v8
  vsmul.vx v8, v3, t1
  vaadd.vv v7, v3, v8
.else
  jalr t0, a4

  vssra.vi v4, v0, 1
  vssra.vi v5, v1, 1
  vssra.vi v6, v2, 1
  vssra.vi v7, v3, 1
.endif

  j 2f

1:
.irp i, 4, 5, 6, 7
  vmv.v.x v\i, zero
.endr

2:
  vle16.v v0, (a2)
  addi t0, a2, 32
  vle16.v v1, (t0)
  addi t0, t0, 32
  vle16.v v2, (t0)
  addi t0, t0, 32
  vle16.v v3, (t0)

.ifc \variant, identity_
  li t1, (5793-4096)*8
.irp i, 0, 1, 2, 3
  vsmul.vx v8, v\i, t1
  vaadd.vv v\i, v\i, v8
.endr

  j L(itx_4x16_epilog)
.else
  jalr t0, a4

  vssra.vi v0, v0, 1
  vssra.vi v1, v1, 1
  vssra.vi v2, v2, 1
  vssra.vi v3, v3, 1

L(itx_4x16_epilog):
  vsseg4e16.v v0, (a2)
  addi t0, a2, 64
  vsseg4e16.v v4, (t0)

  vsetivli zero, 4, e16, mf2, ta, ma

  vmv.v.x v16, zero
  vle16.v v0, (a2)
  vse16.v v16, (a2)
  addi t0, a2, 8
  vle16.v v1, (t0)
  vse16.v v16, (t0)
.irp i, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
  addi t0, t0, 8
  vle16.v v\i, (t0)
  vse16.v v16, (t0)
.endr

  jalr t0, a5

.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
  vssra.vi v\i, v\i, 4
.endr

  vsetvli zero, zero, e8, mf4, ta, ma

  vle8.v v16, (a0)
  add t0, a0, a1
  vle8.v v17, (t0)
.irp i, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
  add t0, t0, a1
  vle8.v v\i, (t0)
.endr

  vwaddu.wv  v0,  v0, v16
  vwaddu.wv  v1,  v1, v17
  vwaddu.wv  v2,  v2, v18
  vwaddu.wv  v3,  v3, v19
  vwaddu.wv  v4,  v4, v20
  vwaddu.wv  v5,  v5, v21
  vwaddu.wv  v6,  v6, v22
  vwaddu.wv  v7,  v7, v23
  vwaddu.wv  v8,  v8, v24
  vwaddu.wv  v9,  v9, v25
  vwaddu.wv v10, v10, v26
  vwaddu.wv v11, v11, v27
  vwaddu.wv v12, v12, v28
  vwaddu.wv v13, v13, v29
  vwaddu.wv v14, v14, v30
  vwaddu.wv v15, v15, v31

  vsetvli zero, zero, e16, mf2, ta, ma
.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
  vmax.vx v\i, v\i, zero
.endr

  vsetvli zero, zero, e8, mf4, ta, ma

  vnclipu.wi v16,  v0, 0
  vnclipu.wi v17,  v1, 0
  vnclipu.wi v18,  v2, 0
  vnclipu.wi v19,  v3, 0
  vnclipu.wi v20,  v4, 0
  vnclipu.wi v21,  v5, 0
  vnclipu.wi v22,  v6, 0
  vnclipu.wi v23,  v7, 0
  vnclipu.wi v24,  v8, 0
  vnclipu.wi v25,  v9, 0
  vnclipu.wi v26, v10, 0
  vnclipu.wi v27, v11, 0
  vnclipu.wi v28, v12, 0
  vnclipu.wi v29, v13, 0
  vnclipu.wi v30, v14, 0
  vnclipu.wi v31, v15, 0

  vse8.v v16, (a0)
.irp i, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
  add a0, a0, a1
  vse8.v v\i, (a0)
.endr

  ret
.endif
endfunc

function inv_txfm_\variant\()add_16x4_rvv, export=1, ext=v
  csrw vxrm, zero

  vsetivli zero, 4, e16, mf2, ta, ma
  vle16.v v0, (a2)
  addi t0, a2, 8
  vle16.v v1, (t0)
.irp i, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
  addi t0, t0, 8
  vle16.v v\i, (t0)
.endr

.ifc \variant, identity_
  li t1, 2*(5793-4096)*8
.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
  vsmul.vx v16, v\i, t1
  vssra.vi v16, v16, 1
  vsadd.vv v\i, v\i, v16
.endr

  j L(itx_16x4_epilog)
.else
  jalr t0, a4

.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
  vssra.vi v\i, v\i, 1
.endr

L(itx_16x4_epilog):
  li t0, 32
  vssseg8e16.v v0, (a2), t0
  addi t1, a2, 16
  vssseg8e16.v v8, (t1), t0

.irp j, 0, 8
  vsetivli zero, 8, e16, m1, ta, ma

  vmv.v.x v4, zero
  addi t0, a2, \j*2
  vle16.v v0, (t0)
  vse16.v v4, (t0)
.irp i, 1, 2, 3
  addi t0, t0, 32
  vle16.v v\i, (t0)
  vse16.v v4, (t0)
.endr

  jalr t0, a5

  vssra.vi v0, v0, 4
  vssra.vi v1, v1, 4
  vssra.vi v2, v2, 4
  vssra.vi v3, v3, 4

  vsetvli zero, zero, e8, mf2, ta, ma
  addi t0, a0, \j
  vle8.v v4, (t0)
  add t0, t0, a1
  vle8.v v5, (t0)
  add t0, t0, a1
  vle8.v v6, (t0)
  add t0, t0, a1
  vle8.v v7, (t0)

  vwaddu.wv v0, v0, v4
  vwaddu.wv v1, v1, v5
  vwaddu.wv v2, v2, v6
  vwaddu.wv v3, v3, v7

  vsetvli zero, zero, e16, m1, ta, ma
  vmax.vx v0, v0, zero
  vmax.vx v1, v1, zero
  vmax.vx v2, v2, zero
  vmax.vx v3, v3, zero

  vsetvli zero, zero, e8, mf2, ta, ma

  vnclipu.wi v4, v0, 0
  vnclipu.wi v5, v1, 0
  vnclipu.wi v6, v2, 0
  vnclipu.wi v7, v3, 0

  addi t0, a0, \j
  vse8.v v4, (t0)
  add t0, t0, a1
  vse8.v v5, (t0)
  add t0, t0, a1
  vse8.v v6, (t0)
  add t0, t0, a1
  vse8.v v7, (t0)
.endr

  ret
.endif
endfunc
.endm

def_fn_416_base identity_
def_fn_416_base

.macro def_fn_416 w, h, txfm1, txfm2, eob_half
function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_rvv, export=1
.if \w == 4 && (\txfm1 == adst || \txfm1 == flipadst)
  la a4, inv_\txfm1\()_e16_x\w\()w_rvv
.elseif \txfm1 != identity
  la a4, inv_\txfm1\()_e16_x\w\()_rvv
.endif
.if \h == 4 && (\txfm2 == adst || \txfm2 == flipadst)
  la a5, inv_\txfm2\()_e16_x\h\()w_rvv
.else
  la a5, inv_\txfm2\()_e16_x\h\()_rvv
.endif
.if \w == 4
  li a6, \eob_half
.endif
.ifc \txfm1, identity
  j inv_txfm_identity_add_\w\()x\h\()_rvv
.else
  j inv_txfm_add_\w\()x\h\()_rvv
.endif
endfunc
.endm

.macro def_fns_416 w, h
def_fn_416 \w, \h, dct, dct, 29
def_fn_416 \w, \h, identity, identity, 29
def_fn_416 \w, \h, dct, adst, 29
def_fn_416 \w, \h, dct, flipadst, 29
def_fn_416 \w, \h, dct, identity, 8
def_fn_416 \w, \h, adst, dct, 29
def_fn_416 \w, \h, adst, adst, 29
def_fn_416 \w, \h, adst, flipadst, 29
def_fn_416 \w, \h, flipadst, dct, 29
def_fn_416 \w, \h, flipadst, adst, 29
def_fn_416 \w, \h, flipadst, flipadst, 29
def_fn_416 \w, \h, identity, dct, 32
def_fn_416 \w, \h, adst, identity, 8
def_fn_416 \w, \h, flipadst, identity, 8
def_fn_416 \w, \h, identity, adst, 32
def_fn_416 \w, \h, identity, flipadst, 32
.endm

def_fns_416 4, 16
def_fns_416 16, 4

.macro def_fn_816_base variant
function inv_txfm_\variant\()add_8x16_rvv, export=1, ext=v
  csrw vxrm, zero

  vsetivli zero, 8, e16, m1, ta, ma

  blt a3, a6, 1f

  vmv.v.x v16, zero
  addi t0, a2, 16
  vle16.v v0, (t0)
  vse16.v v16, (t0)
.irp i, 1, 2, 3, 4, 5, 6, 7
  addi t0, t0, 32
  vle16.v v\i, (t0)
  vse16.v v16, (t0)
.endr

  li t1, 2896*8
.ifc \variant, identity_
  vsmul.vx  v8, v0, t1
  vsmul.vx  v9, v1, t1
  vsmul.vx v10, v2, t1
  vsmul.vx v11, v3, t1
  vsmul.vx v12, v4, t1
  vsmul.vx v13, v5, t1
  vsmul.vx v14, v6, t1
  vsmul.vx v15, v7, t1
.else
.irp i, 0, 1, 2, 3, 4, 5, 6, 7
  vsmul.vx v\i, v\i, t1
.endr

  jalr t0, a4

  vssra.vi  v8, v0, 1
  vssra.vi  v9, v1, 1
  vssra.vi v10, v2, 1
  vssra.vi v11, v3, 1
  vssra.vi v12, v4, 1
  vssra.vi v13, v5, 1
  vssra.vi v14, v6, 1
  vssra.vi v15, v7, 1
.endif

  j 2f

1:
.irp i, 8, 9, 10, 11, 12, 13, 14, 15
  vmv.v.x v\i, zero
.endr

2:
  vmv.v.x v16, zero
  vle16.v v0, (a2)
  vse16.v v16, (a2)
  addi t0, a2, 32
  vle16.v v1, (t0)
  vse16.v v16, (t0)
.irp i, 2, 3, 4, 5, 6, 7
  addi t0, t0, 32
  vle16.v v\i, (t0)
  vse16.v v16, (t0)
.endr

  li t1, 2896*8
.irp i, 0, 1, 2, 3, 4, 5, 6, 7
  vsmul.vx v\i, v\i, t1
.endr

.ifc \variant, identity_
  j L(itx_8x16_epilog)
.else
  jalr t0, a4

.irp i, 0, 1, 2, 3, 4, 5, 6, 7
  vssra.vi v\i, v\i, 1
.endr

L(itx_8x16_epilog):
  addi t4, sp, -8*32
  vsseg8e16.v v0, (t4)
  addi t0, t4, 8*16
  vsseg8e16.v v8, (t0)

  mv t5, a0
  li t6, 16
  jal a7, inv_txfm_add_vert_8x16_rvv

  ret
.endif
endfunc

function inv_txfm_\variant\()add_16x8_rvv, export=1, ext=v
  csrw vxrm, zero

  vsetivli zero, 8, e16, m1, ta, ma
  vle16.v v0, (a2)
  addi t0, a2, 16
  vle16.v v1, (t0)
.irp i, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
  addi t0, t0, 16
  vle16.v v\i, (t0)
.endr

  li t1, 2896*8
.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
  vsmul.vx v\i, v\i, t1
.endr

.ifc \variant, identity_
  li t1, 2*(5793-4096)*8
.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
  vsmul.vx v16, v\i, t1
  vssra.vi v16, v16, 1
  vsadd.vv v\i, v\i, v16
.endr

  j L(itx_16x8_epilog)
.else
  jalr t0, a4

.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
  vssra.vi v\i, v\i, 1
.endr

L(itx_16x8_epilog):
  li t0, 32
  vssseg8e16.v v0, (a2), t0
  addi t1, a2, 16
  vssseg8e16.v v8, (t1), t0

.irp j, 0, 8
  vsetivli zero, 8, e16, m1, ta, ma

  vmv.v.x v8, zero
  addi t0, a2, \j*2
  vle16.v v0, (t0)
  vse16.v v8, (t0)
.irp i, 1, 2, 3, 4, 5, 6, 7
  addi t0, t0, 32
  vle16.v v\i, (t0)
  vse16.v v8, (t0)
.endr

  jalr t0, a5

.irp i, 0, 1, 2, 3, 4, 5, 6, 7
  vssra.vi v\i, v\i, 4
.endr

  vsetvli zero, zero, e8, mf2, ta, ma
  addi t0, a0, \j
  vle8.v v8, (t0)
.irp i, 9, 10, 11, 12, 13, 14, 15
  add t0, t0, a1
  vle8.v v\i, (t0)
.endr

  vwaddu.wv v0, v0, v8
  vwaddu.wv v1, v1, v9
  vwaddu.wv v2, v2, v10
  vwaddu.wv v3, v3, v11
  vwaddu.wv v4, v4, v12
  vwaddu.wv v5, v5, v13
  vwaddu.wv v6, v6, v14
  vwaddu.wv v7, v7, v15

  vsetvli zero, zero, e16, m1, ta, ma
.irp i, 0, 1, 2, 3, 4, 5, 6, 7
  vmax.vx v\i, v\i, zero
.endr

  vsetvli zero, zero, e8, mf2, ta, ma

  vnclipu.wi  v8, v0, 0
  vnclipu.wi  v9, v1, 0
  vnclipu.wi v10, v2, 0
  vnclipu.wi v11, v3, 0
  vnclipu.wi v12, v4, 0
  vnclipu.wi v13, v5, 0
  vnclipu.wi v14, v6, 0
  vnclipu.wi v15, v7, 0

  addi t0, a0, \j
  vse8.v v8, (t0)
.irp i, 9, 10, 11, 12, 13, 14, 15
  add t0, t0, a1
  vse8.v v\i, (t0)
.endr
.endr

  ret
.endif
endfunc
.endm

def_fn_816_base identity_
def_fn_816_base

.macro def_fn_816 w, h, txfm1, txfm2, eob_half
function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_rvv, export=1
.ifnc \txfm1, identity
  la a4, inv_\txfm1\()_e16_x\w\()_rvv
.endif
  la a5, inv_\txfm2\()_e16_x\h\()_rvv
.if \w == 8
  li a6, \eob_half
.endif
.ifc \txfm1, identity
  j inv_txfm_identity_add_\w\()x\h\()_rvv
.else
  j inv_txfm_add_\w\()x\h\()_rvv
.endif
endfunc
.endm

.macro def_fns_816 w, h
def_fn_816 \w, \h, dct, dct, 43
def_fn_816 \w, \h, identity, identity, 43
def_fn_816 \w, \h, dct, adst, 43
def_fn_816 \w, \h, dct, flipadst, 43
def_fn_816 \w, \h, dct, identity, 8
def_fn_816 \w, \h, adst, dct, 43
def_fn_816 \w, \h, adst, adst, 43
def_fn_816 \w, \h, adst, flipadst, 43
def_fn_816 \w, \h, flipadst, dct, 43
def_fn_816 \w, \h, flipadst, adst, 43
def_fn_816 \w, \h, flipadst, flipadst, 43
def_fn_816 \w, \h, identity, dct, 64
def_fn_816 \w, \h, adst, identity, 8
def_fn_816 \w, \h, flipadst, identity, 8
def_fn_816 \w, \h, identity, adst, 64
def_fn_816 \w, \h, identity, flipadst, 64
.endm

def_fns_816 8, 16
def_fns_816 16, 8