xref: /aosp_15_r20/external/libdav1d/src/loongarch/ipred.S (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
/*
 * Copyright © 2024, VideoLAN and dav1d authors
 * Copyright © 2024, Loongson Technology Corporation Limited
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
27
28#include "src/loongarch/loongson_asm.S"
29
.macro ipred_dc_gen topleft, width, height
// Compute the DC predictor value:
//   t0 = (sum of \width pixels above + sum of \height pixels left
//         + (w + h) / 2) / (w + h),
// with the extra 1/3 or 1/5 fixup multiply for rectangular blocks.
// In:  \topleft = pointer to the top-left neighbour sample.
// Out: t0 = dc value.  Clobbers: t1-t4, vr0, f0.
    add.d          t0,      \width,  \height //dc
    srai.d         t0,      t0,      1       // rounding bias (w + h) / 2
    addi.d         t3,      \topleft,1       // t3 = first pixel of the top row

    or             t1,      zero,    zero  //data index
    srai.d         t2,      \width,  4     //loop param: width / 16
    beqz           t2,      2f             // width < 16 -> tail cases

1:  // width/16: sum 16 top pixels per iteration
    vldx           vr0,     t3,      t1
    vhaddw.hu.bu   vr0,     vr0,     vr0   // widening pairwise adds:
    vhaddw.wu.hu   vr0,     vr0,     vr0   // reduce 16 x u8 down to a
    vhaddw.du.wu   vr0,     vr0,     vr0   // single 128-bit sum
    vhaddw.qu.du   vr0,     vr0,     vr0

    vpickve2gr.du  t4,      vr0,     0
    add.d          t0,      t0,      t4

    addi.d         t1,      t1,      16
    addi.d         t2,      t2,      -1
    bnez           t2,      1b
    b              4f

2:  // &8 (only reached when width < 16)
    andi           t2,      \width,  8
    beqz           t2,      3f

    vxor.v         vr0,     vr0,     vr0   // clear: fldx.d fills only low 64 bits
    fldx.d         f0,      t3,      t1    // 8 top pixels

    vhaddw.hu.bu   vr0,     vr0,     vr0
    vhaddw.wu.hu   vr0,     vr0,     vr0
    vhaddw.du.wu   vr0,     vr0,     vr0

    vpickve2gr.du  t4,      vr0,     0
    add.d          t0,      t0,      t4
    addi.d         t1,      t1,      8
    b              4f

3:  // &4
    andi           t2,      \width,  4
    beqz           t2,      4f

    vxor.v         vr0,     vr0,     vr0
    fldx.s         f0,      t3,      t1    // 4 top pixels

    vhaddw.hu.bu   vr0,     vr0,     vr0
    vhaddw.wu.hu   vr0,     vr0,     vr0   // word 0 now holds the 4-pixel sum

    vpickve2gr.wu  t4,      vr0,     0
    add.d          t0,      t0,      t4
    addi.d         t1,      t1,      4

4:  // accumulate the left column, walking backwards from \topleft
    addi.d         t3,      \topleft,0
    srai.d         t2,      \height, 4     //loop param: height / 16
    beqz           t2,      8f

7:  // height/16: sum 16 left pixels per iteration
    addi.d         t3,      t3,      -16
    vld            vr0,     t3,      0

    vhaddw.hu.bu   vr0,     vr0,     vr0
    vhaddw.wu.hu   vr0,     vr0,     vr0
    vhaddw.du.wu   vr0,     vr0,     vr0
    vhaddw.qu.du   vr0,     vr0,     vr0

    vpickve2gr.du  t4,      vr0,     0
    add.d          t0,      t0,      t4

    addi.d         t2,      t2,      -1
    bnez           t2,      7b
    b              10f

8:  // &8 (only reached when height < 16)
    andi           t2,      \height, 8
    beqz           t2,      9f

    addi.d         t3,      t3,      -8
    vxor.v         vr0,     vr0,     vr0
    fld.d          f0,      t3,      0

    vhaddw.hu.bu   vr0,     vr0,     vr0
    vhaddw.wu.hu   vr0,     vr0,     vr0
    vhaddw.du.wu   vr0,     vr0,     vr0

    vpickve2gr.du  t4,      vr0,     0
    add.d          t0,      t0,      t4
    b              10f

9:  // &4
    andi           t2,      \height, 4
    beqz           t2,      10f

    addi.d         t3,      t3,      -4
    vxor.v         vr0,     vr0,     vr0
    fld.s          f0,      t3,      0

    vhaddw.hu.bu   vr0,     vr0,     vr0
    vhaddw.wu.hu   vr0,     vr0,     vr0

    vpickve2gr.wu  t4,      vr0,     0
    add.d          t0,      t0,      t4

10: // divide by the power-of-two part of (w + h)
    add.d          t1,      \width,  \height
    ctz.w          t1,      t1
    sra.w          t0,      t0,      t1

    // w != h: the shift above only divided by 2^ctz(w+h); the remaining
    // odd factor (3 for 2:1 blocks, 5 for 4:1 blocks) is divided out with
    // a fixed-point multiply: 0x5556 ~= 2^16/3, 0x3334 ~= 2^16/5.
    beq            \width,  \height, 16f
    add.d          t2,      \height, \height
    add.d          t3,      \width,  \width
    slt            t2,      t2,      \width   // 2*h < w ?
    slt            t3,      t3,      \height  // 2*w < h ?
    or             t2,      t2,      t3       // -> 4:1 aspect ratio
    li.w           t3,      0x3334
    maskeqz        t1,      t3,      t2
    li.w           t3,      0x5556
    masknez        t2,      t3,      t2
    or             t1,      t1,      t2
    mul.w          t0,      t0,      t1
    srai.w         t0,      t0,      16

16:
.endm
157
.macro ipred_splat_dc dst, stride, width, height, dc
// Fill a \width x \height pixel block at \dst (row pitch \stride) with
// the single byte value \dc.
// Clobbers: t1, t2, t4, t5, vr0/f0; \height is counted down to zero.
    li.w           t1,      4
    blt            t1,      \width,  2f      // width > 4 -> vector path

    li.w           t1,      0x01010101
    mulw.d.wu      t1,      \dc,     t1      // replicate dc into 4 bytes
    beqz           \height, 7f
    or             t2,      \dst,    \dst    // t2 = row pointer
1:  // width <= 4: one 32-bit store per row
    st.w           t1,      t2,      0
    add.d          t2,      t2,      \stride
    addi.d         \height, \height, -1
    bnez           \height, 1b
    b              7f

2:  //width > 4
    li.d           t1,      0x0101010101010101
    mul.d          t1,      \dc,     t1      // replicate dc into 8 bytes...
    vreplgr2vr.d   vr0,     t1               // ...and into all 16 lanes
    or             t4,      \dst,    \dst    // t4 = row pointer
    beqz           \height, 7f

3:  // per-row dispatch, widest store first
    andi           t5,      \width,  64
    beqz           t5,      4f
    vst            vr0,     t4,      0
    vst            vr0,     t4,      16
    vst            vr0,     t4,      32
    vst            vr0,     t4,      48
    b              6f

4:
    andi           t5,      \width,  32
    beqz           t5,      41f
    vst            vr0,     t4,      0
    vst            vr0,     t4,      16
    b              6f

41:
    andi           t5,      \width,  16
    beqz           t5,      5f
    vst            vr0,     t4,      0
    b              6f

5:  // no 16/32/64 bit set: 8-byte store (width 8 for power-of-two sizes)
    fst.d          f0,      t4,      0

6:  // advance to the next row
    add.d          t4,      t4,      \stride
    addi.d         \height, \height, -1
    bnez           \height, 3b

7:
.endm
212
.macro ipred_dc_gen_top topleft, width
// t0 = rounded average of the \width pixels above \topleft:
//   (sum + width/2) >> log2(width).
// Clobbers: t1-t3, vr0, f0.
    srai.d         t0,      \width,  1       // rounding bias width / 2
    addi.d         t1,      \topleft,1       // t1 = first pixel of the top row

    srai.d         t2,      \width,  4       // width / 16 iterations
    beqz           t2,      2f
1:  // 16 pixels per iteration
    vld            vr0,     t1,      0
    vhaddw.hu.bu   vr0,     vr0,     vr0     // widening pairwise adds:
    vhaddw.wu.hu   vr0,     vr0,     vr0     // 16 x u8 -> single sum
    vhaddw.du.wu   vr0,     vr0,     vr0
    vhaddw.qu.du   vr0,     vr0,     vr0

    vpickve2gr.du  t3,      vr0,     0
    add.d          t0,      t0,      t3

    addi.d         t1,      t1,      16
    addi.d         t2,      t2,      -1
    bnez           t2,      1b
    b              4f

2:  // &8 (only reached when width < 16)
    andi           t2,      \width,  8
    beqz           t2,      3f

    vxor.v         vr0,     vr0,     vr0     // clear high lanes before partial load
    fld.d          f0,      t1,      0

    vhaddw.hu.bu   vr0,     vr0,     vr0
    vhaddw.wu.hu   vr0,     vr0,     vr0
    vhaddw.du.wu   vr0,     vr0,     vr0

    vpickve2gr.du  t2,      vr0,     0
    add.d          t0,      t0,      t2

    addi.d         t1,      t1,      8
    b              4f

3:  // &4
    andi           t2,      \width,  4
    beqz           t2,      4f

    vxor.v         vr0,     vr0,     vr0
    fld.s          f0,      t1,      0

    vhaddw.hu.bu   vr0,     vr0,     vr0
    vhaddw.wu.hu   vr0,     vr0,     vr0     // sum in word 0, upper words zero

    vpickve2gr.du  t2,      vr0,     0
    add.d          t0,      t0,      t2
    addi.d         t1,      t1,      4

4:  // dc = sum >> log2(width)
    ctz.w          t1,      \width
    sra.w          t0,      t0,      t1
.endm
269
.macro ipred_dc_gen_left topleft, height
// t0 = rounded average of the \height pixels to the left of \topleft:
//   (sum + height/2) >> log2(height).
// NOTE: \topleft is decremented in place while walking the left column.
// Clobbers: t1, t2, t4, vr0, f0.
    srai.d         t0,      \height, 1      // rounding bias height / 2
    srai.d         t2,      \height, 4      //loop param: height / 16
    beqz           t2,      8f

7:  // height/16: sum 16 left pixels per iteration
    addi.d         \topleft,\topleft,-16
    vld            vr0,     \topleft,0

    vhaddw.hu.bu   vr0,     vr0,     vr0    // widening pairwise adds:
    vhaddw.wu.hu   vr0,     vr0,     vr0    // 16 x u8 -> single sum
    vhaddw.du.wu   vr0,     vr0,     vr0
    vhaddw.qu.du   vr0,     vr0,     vr0

    vpickve2gr.du  t4,      vr0,     0
    add.d          t0,      t0,      t4

    addi.d         t2,      t2,      -1
    bnez           t2,      7b
    b              10f

8:  // &8 (only reached when height < 16)
    andi           t2,      \height, 8
    beqz           t2,      9f

    addi.d         \topleft,\topleft,-8
    vxor.v         vr0,     vr0,     vr0    // clear high lanes before partial load
    fld.d          f0,      \topleft,0

    vhaddw.hu.bu   vr0,     vr0,     vr0
    vhaddw.wu.hu   vr0,     vr0,     vr0
    vhaddw.du.wu   vr0,     vr0,     vr0

    vpickve2gr.du  t4,      vr0,     0
    add.d          t0,      t0,      t4
    b              10f

9:  // &4
    andi           t2,      \height, 4
    beqz           t2,      10f

    addi.d         \topleft,\topleft,-4
    vxor.v         vr0,     vr0,     vr0
    fld.s          f0,      \topleft,0

    vhaddw.hu.bu   vr0,     vr0,     vr0
    vhaddw.wu.hu   vr0,     vr0,     vr0

    vpickve2gr.wu  t4,      vr0,     0
    add.d          t0,      t0,      t4

10: // dc = sum >> log2(height)
    ctz.w          t1,      \height
    sra.w          t0,      t0,      t1

.endm
326
// void ipred_dc_lsx(pixel *dst, const ptrdiff_t stride,
//                   const pixel *const topleft,
//                   const int width, const int height, const int a,
//                   const int max_width, const int max_height
//                   HIGHBD_DECL_SUFFIX)
function ipred_dc_8bpc_lsx
    // DC prediction: average top + left neighbours into t0, then splat.
    ipred_dc_gen   a2, a3, a4
    ipred_splat_dc a0, a1, a3, a4, t0

endfunc
337
// void ipred_dc_128_lsx(pixel *dst, const ptrdiff_t stride,
//                       const pixel *const topleft,
//                       const int width, const int height, const int a,
//                       const int max_width, const int max_height
//                       HIGHBD_DECL_SUFFIX)
function ipred_dc_128_8bpc_lsx
    // Flat prediction with the mid-grey value 1 << (bitdepth - 1) = 128;
    // no neighbour pixels are read.
    ori            t0,      zero,    128
    ipred_splat_dc a0, a1, a3, a4, t0

endfunc
348
// void ipred_dc_top_c(pixel *dst, const ptrdiff_t stride,
//                     const pixel *const topleft,
//                     const int width, const int height, const int a,
//                     const int max_width, const int max_height
//                     HIGHBD_DECL_SUFFIX)
function ipred_dc_top_8bpc_lsx
    // DC from the top row only: average the width pixels above, then splat.
    ipred_dc_gen_top a2, a3
    ipred_splat_dc   a0, a1, a3, a4, t0

endfunc
359
// void ipred_dc_left_c(pixel *dst, const ptrdiff_t stride,
//                      const pixel *const topleft,
//                      const int width, const int height, const int a,
//                      const int max_width, const int max_height
//                      HIGHBD_DECL_SUFFIX)
function ipred_dc_left_8bpc_lsx
    // DC from the left column only: average the height pixels left of
    // topleft (the macro advances a2 in place), then splat.
    ipred_dc_gen_left a2, a4
    ipred_splat_dc    a0, a1, a3, a4, t0

endfunc
370
.macro pixel_set_8bpc dst_ptr, src_ptr, width
// Broadcast the single byte at \src_ptr across one row of \width pixels
// at \dst_ptr.  Exactly one of the 64/32/16/8/4 store cases runs.
// Clobbers: a5, vr0/f0.
    vldrepl.b      vr0,     \src_ptr, 0    // splat source byte to all lanes
1:
    andi           a5,      \width,   64
    beqz           a5,      2f

    vst            vr0,     \dst_ptr, 0
    vst            vr0,     \dst_ptr, 16
    vst            vr0,     \dst_ptr, 32
    vst            vr0,     \dst_ptr, 48
    b              6f
2:
    andi           a5,      \width,   32
    beqz           a5,      3f

    vst            vr0,     \dst_ptr, 0
    vst            vr0,     \dst_ptr, 16
    b              6f
3:
    andi           a5,      \width,   16
    beqz           a5,      4f

    vst            vr0,     \dst_ptr, 0
    b              6f
4:
    andi           a5,      \width,   8
    beqz           a5,      5f

    fst.d          f0,      \dst_ptr, 0
    b              6f
5:
    andi           a5,      \width,   4
    beqz           a5,      6f

    fst.s          f0,      \dst_ptr, 0
6:
.endm
408
// void ipred_h_c(pixel *dst, const ptrdiff_t stride,
//                const pixel *const topleft,
//                const int width, const int height, const int a,
//                const int max_width, const int max_height
//                HIGHBD_DECL_SUFFIX)
function ipred_h_8bpc_lsx
    // Horizontal prediction: fill row y with the left-neighbour pixel
    // topleft[-(y + 1)].
    beqz           a4,      .IPRED_H_END
.IPRED_H_LOOP:
    addi.d         a2,      a2,      -1    // step down the left column

    pixel_set_8bpc a0, a2, a3

    add.d          a0,      a0,      a1    // next output row
    addi.d         a4,      a4,      -1
    bnez           a4,      .IPRED_H_LOOP

.IPRED_H_END:
endfunc
427
.macro pixel_copy_8bpc dst_ptr, src_ptr, width
// Copy one row of \width pixels from \src_ptr to \dst_ptr.  Exactly one
// of the 64/32/16/8/4 cases runs.  Clobbers: a5, vr0-vr3, f0.
1:
    andi           a5,      \width,   64
    beqz           a5,      2f

    vld            vr0,     \src_ptr, 0
    vld            vr1,     \src_ptr, 16
    vld            vr2,     \src_ptr, 32
    vld            vr3,     \src_ptr, 48

    vst            vr0,     \dst_ptr, 0
    vst            vr1,     \dst_ptr, 16
    vst            vr2,     \dst_ptr, 32
    vst            vr3,     \dst_ptr, 48

    b              6f
2:
    andi           a5,      \width,   32
    beqz           a5,      3f

    vld            vr0,     \src_ptr, 0
    vld            vr1,     \src_ptr, 16

    vst            vr0,     \dst_ptr, 0
    vst            vr1,     \dst_ptr, 16

    b              6f
3:
    andi           a5,      \width,   16
    beqz           a5,      4f

    vld            vr0,     \src_ptr, 0
    vst            vr0,     \dst_ptr, 0

    b              6f
4:
    andi           a5,      \width,   8
    beqz           a5,      5f

    fld.d          f0,      \src_ptr, 0
    fst.d          f0,      \dst_ptr, 0

    b              6f
5:
    andi           a5,      \width,   4
    beqz           a5,      6f

    fld.s          f0,      \src_ptr, 0
    fst.s          f0,      \dst_ptr, 0
6:
.endm
479
// void ipred_v_lsx(pixel *dst, const ptrdiff_t stride,
//                  const pixel *const topleft,
//                  const int width, const int height, const int a,
//                  const int max_width, const int max_height
//                  HIGHBD_DECL_SUFFIX)
function ipred_v_8bpc_lsx
    // Vertical prediction: every output row is a copy of the top
    // neighbour row topleft[1..width].
    beqz           a4,      .IPRED_V_END
    addi.d         a2,      a2,      1     // a2 = first pixel of the top row
.IPRED_V_LOOP:
    pixel_copy_8bpc  a0, a2, a3

    add.d          a0,      a0,      a1    // next output row
    addi.d         a4,      a4,      -1
    bnez           a4,      .IPRED_V_LOOP

.IPRED_V_END:
endfunc
497
// void ipred_paeth_lsx(pixel *dst, const ptrdiff_t stride,
//                      const pixel *const tl_ptr,
//                      const int width, const int height, const int a,
//                      const int max_width, const int max_height
//                      HIGHBD_DECL_SUFFIX)
function ipred_paeth_8bpc_lsx
    // Paeth prediction.  For each pixel, with base = left + top - topleft,
    // pick whichever of top/left/topleft is closest to base:
    //   ldiff  = |base - left|    = |topleft - top|
    //   tdiff  = |base - top|     = |topleft - left|
    //   tldiff = |base - topleft| = |left + top - 2*topleft|
    // Pixels are widened to u16 so the differences cannot overflow.
    // vr0 = topleft (splat, u16), vr1 = current row's left pixel (splat, u16).
    // Exactly one of the 64/32/16/8/4 width cases runs per row; each
    // 16-column stanza below is the same selection sequence on a new
    // 16-byte slice of the top row (vr2 = low 8 tops, vr9 = high 8 tops).
    vldrepl.b      vr0,     a2,      0    //topleft
    vsllwil.hu.bu  vr0,     vr0,     0
    or             a6,      a2,      a2   // a6 walks down the left column
    addi.d         a7,      a2,      1    // a7 = top row

.IPRED_PAETH_H_LOOP:
    addi.d         a6,      a6,      -1
    vldrepl.b      vr1,     a6,      0   //left
    vsllwil.hu.bu  vr1,     vr1,     0

.IPRED_PAETH_W_LOOP64:
    andi           a5,      a3,      64
    beqz           a5,      .IPRED_PAETH_W_LOOP32

    // columns 0-15
    vld            vr2,     a7,      0   //top
    vpermi.w       vr9,     vr2,     0x0e // vr9 = upper 8 top bytes
    vsllwil.hu.bu  vr2,     vr2,     0
    vsllwil.hu.bu  vr9,     vr9,     0

    vabsd.hu       vr5,     vr0,     vr1  //tdiff = |topleft - left|
    vabsd.hu       vr4,     vr0,     vr2  //ldiff = |topleft - top|
    vabsd.hu       vr10,    vr0,     vr9  // ldiff for upper 8 columns

    vadd.h         vr3,     vr0,     vr0  // 2*topleft
    vadd.h         vr6,     vr1,     vr2
    vadd.h         vr11,    vr1,     vr9
    vabsd.hu       vr6,     vr3,     vr6  //tldiff = |left + top - 2*topleft|
    vabsd.hu       vr11,    vr3,     vr11 //tldiff, upper 8 columns

    // lower 8: top if tdiff <= tldiff else topleft; left overrides when
    // ldiff <= tdiff && ldiff <= tldiff
    vsle.hu        vr3,     vr5,     vr6
    vbitsel.v      vr7,     vr0,     vr2,    vr3
    vsle.hu        vr3,     vr4,     vr5
    vsle.hu        vr8,     vr4,     vr6
    vand.v         vr3,     vr3,     vr8
    vbitsel.v      vr3,     vr7,     vr1,    vr3
    vsrlni.b.h     vr3,     vr3,     0       // pack u16 -> u8

    // upper 8: same selection
    vsle.hu        vr12,    vr5,     vr11
    vbitsel.v      vr7,     vr0,     vr9,    vr12
    vsle.hu        vr12,    vr10,    vr5
    vsle.hu        vr8,     vr10,    vr11
    vand.v         vr12,    vr12,    vr8
    vbitsel.v      vr12,    vr7,     vr1,    vr12
    vsrlni.b.h     vr12,    vr12,    0

    vpermi.w       vr12,    vr3,     0x44    // combine lower/upper 8 bytes

    vst            vr12,    a0,      0

    // columns 16-31 (same sequence)
    vld            vr2,     a7,      16   //top
    vpermi.w       vr9,     vr2,     0x0e
    vsllwil.hu.bu  vr2,     vr2,     0
    vsllwil.hu.bu  vr9,     vr9,     0

    vabsd.hu       vr5,     vr0,     vr1  //tdiff
    vabsd.hu       vr4,     vr0,     vr2  //ldiff
    vabsd.hu       vr10,    vr0,     vr9

    vadd.h         vr3,     vr0,     vr0
    vadd.h         vr6,     vr1,     vr2
    vadd.h         vr11,    vr1,     vr9
    vabsd.hu       vr6,     vr3,     vr6  //tldiff
    vabsd.hu       vr11,    vr3,     vr11 //tldiff

    vsle.hu        vr3,     vr5,     vr6
    vbitsel.v      vr7,     vr0,     vr2,    vr3
    vsle.hu        vr3,     vr4,     vr5
    vsle.hu        vr8,     vr4,     vr6
    vand.v         vr3,     vr3,     vr8
    vbitsel.v      vr3,     vr7,     vr1,    vr3
    vsrlni.b.h     vr3,     vr3,     0

    vsle.hu        vr12,    vr5,     vr11
    vbitsel.v      vr7,     vr0,     vr9,    vr12
    vsle.hu        vr12,    vr10,    vr5
    vsle.hu        vr8,     vr10,    vr11
    vand.v         vr12,    vr12,    vr8
    vbitsel.v      vr12,    vr7,     vr1,    vr12
    vsrlni.b.h     vr12,    vr12,    0

    vpermi.w       vr12,    vr3,     0x44

    vst            vr12,    a0,      16

    // columns 32-47 (same sequence)
    vld            vr2,     a7,      32   //top
    vpermi.w       vr9,     vr2,     0x0e
    vsllwil.hu.bu  vr2,     vr2,     0
    vsllwil.hu.bu  vr9,     vr9,     0

    vabsd.hu       vr5,     vr0,     vr1  //tdiff
    vabsd.hu       vr4,     vr0,     vr2  //ldiff
    vabsd.hu       vr10,    vr0,     vr9

    vadd.h         vr3,     vr0,     vr0
    vadd.h         vr6,     vr1,     vr2
    vadd.h         vr11,    vr1,     vr9
    vabsd.hu       vr6,     vr3,     vr6  //tldiff
    vabsd.hu       vr11,    vr3,     vr11 //tldiff

    vsle.hu        vr3,     vr5,     vr6
    vbitsel.v      vr7,     vr0,     vr2,    vr3
    vsle.hu        vr3,     vr4,     vr5
    vsle.hu        vr8,     vr4,     vr6
    vand.v         vr3,     vr3,     vr8
    vbitsel.v      vr3,     vr7,     vr1,    vr3
    vsrlni.b.h     vr3,     vr3,     0

    vsle.hu        vr12,    vr5,     vr11
    vbitsel.v      vr7,     vr0,     vr9,    vr12
    vsle.hu        vr12,    vr10,    vr5
    vsle.hu        vr8,     vr10,    vr11
    vand.v         vr12,    vr12,    vr8
    vbitsel.v      vr12,    vr7,     vr1,    vr12
    vsrlni.b.h     vr12,    vr12,    0

    vpermi.w       vr12,    vr3,     0x44

    vst            vr12,    a0,      32

    // columns 48-63 (same sequence)
    vld            vr2,     a7,      48   //top
    vpermi.w       vr9,     vr2,     0x0e
    vsllwil.hu.bu  vr2,     vr2,     0
    vsllwil.hu.bu  vr9,     vr9,     0

    vabsd.hu       vr5,     vr0,     vr1  //tdiff
    vabsd.hu       vr4,     vr0,     vr2  //ldiff
    vabsd.hu       vr10,    vr0,     vr9

    vadd.h         vr3,     vr0,     vr0
    vadd.h         vr6,     vr1,     vr2
    vadd.h         vr11,    vr1,     vr9
    vabsd.hu       vr6,     vr3,     vr6  //tldiff
    vabsd.hu       vr11,    vr3,     vr11 //tldiff

    vsle.hu        vr3,     vr5,     vr6
    vbitsel.v      vr7,     vr0,     vr2,    vr3
    vsle.hu        vr3,     vr4,     vr5
    vsle.hu        vr8,     vr4,     vr6
    vand.v         vr3,     vr3,     vr8
    vbitsel.v      vr3,     vr7,     vr1,    vr3
    vsrlni.b.h     vr3,     vr3,     0

    vsle.hu        vr12,    vr5,     vr11
    vbitsel.v      vr7,     vr0,     vr9,    vr12
    vsle.hu        vr12,    vr10,    vr5
    vsle.hu        vr8,     vr10,    vr11
    vand.v         vr12,    vr12,    vr8
    vbitsel.v      vr12,    vr7,     vr1,    vr12
    vsrlni.b.h     vr12,    vr12,    0

    vpermi.w       vr12,    vr3,     0x44

    vst            vr12,    a0,      48

    b              .IPRED_PAETH_W_LOOPEND

.IPRED_PAETH_W_LOOP32:
    andi           a5,      a3,      32
    beqz           a5,      .IPRED_PAETH_W_LOOP16

    // columns 0-15
    vld            vr2,     a7,      0   //top
    vpermi.w       vr9,     vr2,     0x0e
    vsllwil.hu.bu  vr2,     vr2,     0
    vsllwil.hu.bu  vr9,     vr9,     0

    vabsd.hu       vr5,     vr0,     vr1  //tdiff
    vabsd.hu       vr4,     vr0,     vr2  //ldiff
    vabsd.hu       vr10,    vr0,     vr9

    vadd.h         vr3,     vr0,     vr0
    vadd.h         vr6,     vr1,     vr2
    vadd.h         vr11,    vr1,     vr9
    vabsd.hu       vr6,     vr3,     vr6  //tldiff
    vabsd.hu       vr11,    vr3,     vr11 //tldiff

    vsle.hu        vr3,     vr5,     vr6
    vbitsel.v      vr7,     vr0,     vr2,    vr3
    vsle.hu        vr3,     vr4,     vr5
    vsle.hu        vr8,     vr4,     vr6
    vand.v         vr3,     vr3,     vr8
    vbitsel.v      vr3,     vr7,     vr1,    vr3
    vsrlni.b.h     vr3,     vr3,     0

    vsle.hu        vr12,    vr5,     vr11
    vbitsel.v      vr7,     vr0,     vr9,    vr12
    vsle.hu        vr12,    vr10,    vr5
    vsle.hu        vr8,     vr10,    vr11
    vand.v         vr12,    vr12,    vr8
    vbitsel.v      vr12,    vr7,     vr1,    vr12
    vsrlni.b.h     vr12,    vr12,    0

    vpermi.w       vr12,    vr3,     0x44

    vst            vr12,    a0,      0

    // columns 16-31 (same sequence)
    vld            vr2,     a7,      16   //top
    vpermi.w       vr9,     vr2,     0x0e
    vsllwil.hu.bu  vr2,     vr2,     0
    vsllwil.hu.bu  vr9,     vr9,     0

    vabsd.hu       vr5,     vr0,     vr1  //tdiff
    vabsd.hu       vr4,     vr0,     vr2  //ldiff
    vabsd.hu       vr10,    vr0,     vr9

    vadd.h         vr3,     vr0,     vr0
    vadd.h         vr6,     vr1,     vr2
    vadd.h         vr11,    vr1,     vr9
    vabsd.hu       vr6,     vr3,     vr6  //tldiff
    vabsd.hu       vr11,    vr3,     vr11 //tldiff

    vsle.hu        vr3,     vr5,     vr6
    vbitsel.v      vr7,     vr0,     vr2,    vr3
    vsle.hu        vr3,     vr4,     vr5
    vsle.hu        vr8,     vr4,     vr6
    vand.v         vr3,     vr3,     vr8
    vbitsel.v      vr3,     vr7,     vr1,    vr3
    vsrlni.b.h     vr3,     vr3,     0

    vsle.hu        vr12,    vr5,     vr11
    vbitsel.v      vr7,     vr0,     vr9,    vr12
    vsle.hu        vr12,    vr10,    vr5
    vsle.hu        vr8,     vr10,    vr11
    vand.v         vr12,    vr12,    vr8
    vbitsel.v      vr12,    vr7,     vr1,    vr12
    vsrlni.b.h     vr12,    vr12,    0

    vpermi.w       vr12,    vr3,     0x44

    vst            vr12,    a0,      16

    b              .IPRED_PAETH_W_LOOPEND

.IPRED_PAETH_W_LOOP16:
    andi           a5,      a3,      16
    beqz           a5,      .IPRED_PAETH_W_LOOP8

    vld            vr2,     a7,      0   //top
    vpermi.w       vr9,     vr2,     0x0e
    vsllwil.hu.bu  vr2,     vr2,     0
    vsllwil.hu.bu  vr9,     vr9,     0

    vabsd.hu       vr5,     vr0,     vr1  //tdiff
    vabsd.hu       vr4,     vr0,     vr2  //ldiff
    vabsd.hu       vr10,    vr0,     vr9

    vadd.h         vr3,     vr0,     vr0
    vadd.h         vr6,     vr1,     vr2
    vadd.h         vr11,    vr1,     vr9
    vabsd.hu       vr6,     vr3,     vr6  //tldiff
    vabsd.hu       vr11,    vr3,     vr11 //tldiff

    vsle.hu        vr3,     vr5,     vr6
    vbitsel.v      vr7,     vr0,     vr2,    vr3
    vsle.hu        vr3,     vr4,     vr5
    vsle.hu        vr8,     vr4,     vr6
    vand.v         vr3,     vr3,     vr8
    vbitsel.v      vr3,     vr7,     vr1,    vr3
    vsrlni.b.h     vr3,     vr3,     0

    vsle.hu        vr12,    vr5,     vr11
    vbitsel.v      vr7,     vr0,     vr9,    vr12
    vsle.hu        vr12,    vr10,    vr5
    vsle.hu        vr8,     vr10,    vr11
    vand.v         vr12,    vr12,    vr8
    vbitsel.v      vr12,    vr7,     vr1,    vr12
    vsrlni.b.h     vr12,    vr12,    0

    vpermi.w       vr12,    vr3,     0x44

    vst            vr12,    a0,      0

    b              .IPRED_PAETH_W_LOOPEND

.IPRED_PAETH_W_LOOP8:
    andi           a5,      a3,      8
    beqz           a5,      .IPRED_PAETH_W_LOOP4

    // 8 columns: only the low half is needed, so no vr9 split
    fld.d          f2,      a7,      0   //top
    vsllwil.hu.bu  vr2,     vr2,     0

    vabsd.hu       vr5,     vr0,     vr1  //tdiff
    vabsd.hu       vr4,     vr0,     vr2  //ldiff

    vadd.h         vr3,     vr0,     vr0
    vadd.h         vr6,     vr1,     vr2
    vabsd.hu       vr6,     vr3,     vr6 //tldiff

    vsle.hu        vr3,     vr5,     vr6
    vbitsel.v      vr7,     vr0,     vr2,    vr3
    vsle.hu        vr3,     vr4,     vr5
    vsle.hu        vr8,     vr4,     vr6
    vand.v         vr3,     vr3,     vr8
    vbitsel.v      vr3,     vr7,     vr1,    vr3
    vsrlni.b.h     vr3,     vr3,     0
    fst.d          f3,      a0,      0

    b              .IPRED_PAETH_W_LOOPEND

.IPRED_PAETH_W_LOOP4:
    andi           a5,      a3,      4
    beqz           a5,      .IPRED_PAETH_W_LOOPEND

    fld.s          f2,      a7,      0   //top
    vsllwil.hu.bu  vr2,     vr2,     0

    vabsd.hu       vr5,     vr0,     vr1  //tdiff
    vabsd.hu       vr4,     vr0,     vr2  //ldiff

    vadd.h         vr3,     vr0,     vr0
    vadd.h         vr6,     vr1,     vr2
    vabsd.hu       vr6,     vr3,     vr6 //tldiff

    vsle.hu        vr3,     vr5,     vr6
    vbitsel.v      vr7,     vr0,     vr2,    vr3
    vsle.hu        vr3,     vr4,     vr5
    vsle.hu        vr8,     vr4,     vr6
    vand.v         vr3,     vr3,     vr8
    vbitsel.v      vr3,     vr7,     vr1,    vr3
    vsrlni.b.h     vr3,     vr3,     0
    fst.s          f3,      a0,      0

    b              .IPRED_PAETH_W_LOOPEND

.IPRED_PAETH_W_LOOPEND:
    add.d         a0,       a0,      a1    // next output row
    addi.d        a4,       a4,      -1
    bnez          a4,       .IPRED_PAETH_H_LOOP
endfunc
832
// Smooth-prediction weight tables.  The runs are laid out back to back
// so that &dav1d_sm_weights[bs] points at the bs-entry table for block
// size bs (2/4/8/16/32/64); the leading two bytes pad offset 0.
// Weights are in 1/256 units and decrease with distance from the edge.
const dav1d_sm_weights
    .byte  0,   0
    // bs = 2
    .byte  255, 128
    // bs = 4
    .byte  255, 149,  85,  64
    // bs = 8
    .byte  255, 197, 146, 105,  73,  50,  37,  32
    // bs = 16
    .byte  255, 225, 196, 170, 145, 123, 102,  84
    .byte  68,  54,  43,  33,  26,  20,  17,  16
    // bs = 32
    .byte  255, 240, 225, 210, 196, 182, 169, 157
    .byte  145, 133, 122, 111, 101,  92,  83,  74
    .byte  66,  59,  52,  45,  39,  34,  29,  25
    .byte  21,  17,  14,  12,  10,   9,   8,   8
    // bs = 64
    .byte  255, 248, 240, 233, 225, 218, 210, 203
    .byte  196, 189, 182, 176, 169, 163, 156, 150
    .byte  144, 138, 133, 127, 121, 116, 111, 106
    .byte  101,  96,  91,  86,  82,  77,  73,  69
    .byte  65,  61,  57,  54,  50,  47,  44,  41
    .byte  38,  35,  32,  29,  27,  25,  22,  20
    .byte  18,  16,  15,  13,  12,  10,   9,   8
    .byte  7,   6,   6,   5,   5,   4,   4,   4
endconst
859
// void ipred_smooth_lsx(pixel *dst, const ptrdiff_t stride,
//                       const pixel *const topleft,
//                       const int width, const int height, const int a,
//                       const int max_width, const int max_height
//                       HIGHBD_DECL_SUFFIX)
function ipred_smooth_8bpc_lsx
    // Smooth prediction.  Per output pixel (x, y):
    //   pred = ((256 - hor[x]) * right  + hor[x] * topleft[-y - 1]
    //         + (256 - ver[y]) * bottom + ver[y] * topleft[x + 1]
    //         + 256) >> 9
    // with right = topleft[width], bottom = topleft[-height], and
    // hor/ver the dav1d_sm_weights runs for width/height.
    // Processes 4 pixels per inner iteration in 32-bit lanes.
    la.local       a5,      dav1d_sm_weights
    add.d          a6,      a5,      a3  //hor = &sm_weights[width]
    add.d          a5,      a5,      a4  //ver = &sm_weights[height]

    add.d          a7,      a2,      a3
    sub.d          t0,      a2,      a4

    vldrepl.b      vr0,     a7,      0  //right
    vldrepl.b      vr1,     t0,      0  //bottom

    vsllwil.hu.bu  vr0,     vr0,     0  // widen both edge pixels to u32
    vsllwil.wu.hu  vr0,     vr0,     0
    vsllwil.hu.bu  vr1,     vr1,     0
    vsllwil.wu.hu  vr1,     vr1,     0

    li.w           t0,      256
    vreplgr2vr.w   vr6,     t0          // vr6 = 256 (weight scale / rounding)

    addi.d         t0,      a2,      1   //ptr topleft[x]
    addi.d         t3,      a2,      -1  //ptr topleft[y]

.IPRED_SMOOTH_H_LOOP:
    vldrepl.b      vr2,     a5,      0  //ver[y]
    vldrepl.b      vr3,     t3,      0  //topleft[y]

    vsllwil.hu.bu  vr2,     vr2,     0
    vsllwil.wu.hu  vr2,     vr2,     0
    vsllwil.hu.bu  vr3,     vr3,     0
    vsllwil.wu.hu  vr3,     vr3,     0

    vsub.w         vr7,     vr6,     vr2  //256-ver[y]

    or             t1,      zero,    zero  //xx
    srai.d         t2,      a3,      2     //loop max: width / 4

.IPRED_SMOOTH_W_LOOP:
    fldx.s         f4,      t0,      t1   //topleft[x], 4 pixels
    fldx.s         f5,      a6,      t1   //hor[x], 4 weights

    vsllwil.hu.bu  vr4,     vr4,     0
    vsllwil.wu.hu  vr4,     vr4,     0
    vsllwil.hu.bu  vr5,     vr5,     0
    vsllwil.wu.hu  vr5,     vr5,     0

    vsub.w         vr8,     vr6,     vr5  //256-hor[x]

    vmul.w         vr9,     vr8,     vr0  // (256-hor)*right
    vmadd.w        vr9,     vr5,     vr3  // + hor*topleft[y]
    vmadd.w        vr9,     vr7,     vr1  // + (256-ver)*bottom
    vmadd.w        vr9,     vr2,     vr4  //pred: + ver*topleft[x]

    vadd.w         vr9,     vr9,     vr6  // + 256, then >> 9
    vsrlni.h.w     vr9,     vr9,     9
    vsrlni.b.h     vr9,     vr9,     0    // pack to 4 bytes

    fstx.s         f9,      a0,      t1

    addi.d         t1,      t1,      4
    addi.d         t2,      t2,      -1
    bnez           t2,      .IPRED_SMOOTH_W_LOOP

.IPRED_SMOOTH_W_LOOP_END:
    addi.d         t3,      t3,      -1   // next left pixel
    addi.d         a5,      a5,      1    // next ver weight
    add.d          a0,      a0,      a1   // next output row
    addi.d         a4,      a4,      -1
    bnez           a4,      .IPRED_SMOOTH_H_LOOP

endfunc
935
// void ipred_smooth_v_lsx(pixel *dst, const ptrdiff_t stride,
//                         const pixel *const topleft,
//                         const int width, const int height, const int a,
//                         const int max_width, const int max_height
//                         HIGHBD_DECL_SUFFIX)
//
// SMOOTH_V prediction: every output pixel is a vertical blend of the
// pixel above the block (topleft[1 + x]) and the bottom-left sample
// (topleft[-height]), weighted by ver[y] = dav1d_sm_weights[height + y]:
//   pred[y][x] = (ver[y]*top[x] + (256 - ver[y])*bottom + 128) >> 8
// Rows are produced one at a time; columns in chunks of 8, or a single
// chunk of 4 when width == 4 (widths are powers of two >= 4).
function ipred_smooth_v_8bpc_lsx
    la.local       a5,      dav1d_sm_weights
    add.d          a5,      a5,      a4  //ver = &dav1d_sm_weights[height]

    sub.d          t0,      a2,      a4
    vldrepl.b      vr0,     t0,      0  //bottom = topleft[-height], splatted
    vsllwil.hu.bu  vr0,     vr0,     0  //widen u8 -> u16

    li.w           t0,      256
    vreplgr2vr.h   vr2,     t0          //vr2 = 256 (weight base)
    li.w           t0,      128
    vreplgr2vr.h   vr3,     t0          //vr3 = 128 (rounding bias)

    addi.d         t0,      a2,      1   //ptr topleft[x] (top row)

.IPRED_SMOOTH_V_H_LOOP:
    vldrepl.b      vr1,     a5,      0  //ver[y]
    vsllwil.hu.bu  vr1,     vr1,     0
    vsub.h         vr5,     vr2,     vr1  //256-ver[y]

    or             t1,      zero,    zero  //xx (byte offset into the row)
    srai.d         t2,      a3,      3     //loop max = width/8
    beqz           t2,      .IPRED_SMOOTH_V_W_LOOP4  //width == 4

.IPRED_SMOOTH_V_W_LOOP8:
    fldx.d         f4,      t0,      t1   //topleft[x] (8 top pixels)
    vsllwil.hu.bu  vr4,     vr4,     0

    vmul.h         vr6,     vr5,     vr0  //(256-ver[y])*bottom
    vmadd.h        vr6,     vr1,     vr4  //pred += ver[y]*top[x]
    vadd.h         vr6,     vr6,     vr3  //+128 rounding
    vsrlni.b.h     vr6,     vr6,     8    //>>8 and narrow to u8

    fstx.d         f6,      a0,      t1

    addi.d         t1,      t1,      8
    addi.d         t2,      t2,      -1
    bnez           t2,      .IPRED_SMOOTH_V_W_LOOP8
    b              .IPRED_SMOOTH_V_W_LOOP_END

.IPRED_SMOOTH_V_W_LOOP4:
    fldx.s         f4,      t0,      t1   //topleft[x] (4 top pixels)
    vsllwil.hu.bu  vr4,     vr4,     0

    vmul.h         vr6,     vr5,     vr0
    vmadd.h        vr6,     vr1,     vr4  //pred
    vadd.h         vr6,     vr6,     vr3
    vsrai.h        vr6,     vr6,     8
    vsrlni.b.h     vr6,     vr6,     0

    fstx.s         f6,      a0,      t1

    addi.d         t1,      t1,      4

.IPRED_SMOOTH_V_W_LOOP_END:
    addi.d         a5,      a5,      1    //next row's weight
    add.d          a0,      a0,      a1   //dst += stride
    addi.d         a4,      a4,      -1
    bnez           a4,      .IPRED_SMOOTH_V_H_LOOP

endfunc
1002
// void ipred_smooth_h_lsx(pixel *dst, const ptrdiff_t stride,
//                         const pixel *const topleft,
//                         const int width, const int height, const int a,
//                         const int max_width, const int max_height
//                         HIGHBD_DECL_SUFFIX)
//
// SMOOTH_H prediction: every output pixel is a horizontal blend of the
// pixel left of the block (topleft[-(1 + y)]) and the top-right sample
// (topleft[width]), weighted by hor[x] = dav1d_sm_weights[width + x]:
//   pred[y][x] = (hor[x]*left[y] + (256 - hor[x])*right + 128) >> 8
// Rows are produced one at a time; columns in chunks of 8, or a single
// chunk of 4 when width == 4 (widths are powers of two >= 4).
//
// Cleanup: the end-of-row label was ".IPRED_SMOOTH_W_H_LOOP_END",
// inconsistent with this function's ".IPRED_SMOOTH_H_*" label scheme;
// renamed to ".IPRED_SMOOTH_H_W_LOOP_END" (both uses are inside this
// function, so the rename is purely cosmetic).
function ipred_smooth_h_8bpc_lsx
    la.local       a5,      dav1d_sm_weights
    add.d          a6,      a5,      a3  //hor = &dav1d_sm_weights[width]

    add.d          a7,      a2,      a3
    vldrepl.b      vr0,     a7,      0  //right = topleft[width], splatted
    vsllwil.hu.bu  vr0,     vr0,     0  //widen u8 -> u16

    li.w           t0,      256
    vreplgr2vr.h   vr1,     t0          //vr1 = 256 (weight base)
    li.w           t0,      128
    vreplgr2vr.h   vr2,     t0          //vr2 = 128 (rounding bias)

    addi.d         t3,      a2,      -1  //ptr topleft[y] (left column)

.IPRED_SMOOTH_H_H_LOOP:
    vldrepl.b      vr3,     t3,      0  //topleft[y] (this row's left pixel)
    vsllwil.hu.bu  vr3,     vr3,     0

    or             t1,      zero,    zero  //xx (byte offset into the row)
    srai.d         t2,      a3,      3     //loop max = width/8
    beqz           t2,      .IPRED_SMOOTH_H_W_LOOP4  //width == 4

.IPRED_SMOOTH_H_W_LOOP8:
    fldx.d         f5,      a6,      t1   //hor[x] (8 weights)
    vsllwil.hu.bu  vr5,     vr5,     0
    vsub.h         vr4,     vr1,     vr5  //256-hor[x]

    vmul.h         vr6,     vr4,     vr0  //(256-hor[x])*right
    vmadd.h        vr6,     vr5,     vr3  //pred += hor[x]*left[y]
    vadd.h         vr6,     vr6,     vr2  //+128 rounding
    vsrlni.b.h     vr6,     vr6,     8    //>>8 and narrow to u8

    fstx.d         f6,      a0,      t1

    addi.d         t1,      t1,      8
    addi.d         t2,      t2,      -1
    bnez           t2,      .IPRED_SMOOTH_H_W_LOOP8
    b              .IPRED_SMOOTH_H_W_LOOP_END

.IPRED_SMOOTH_H_W_LOOP4:
    fldx.s         f5,      a6,      t1   //hor[x] (4 weights)
    vsllwil.hu.bu  vr5,     vr5,     0
    vsub.h         vr4,     vr1,     vr5  //256-hor[x]

    vmul.h         vr6,     vr4,     vr0
    vmadd.h        vr6,     vr5,     vr3  //pred
    vadd.h         vr6,     vr6,     vr2
    vsrai.h        vr6,     vr6,     8
    vsrlni.b.h     vr6,     vr6,     0

    fstx.s         f6,      a0,      t1

    addi.d         t1,      t1,      4

.IPRED_SMOOTH_H_W_LOOP_END:
    addi.d         t3,      t3,      -1   //next left pixel (descending addresses)
    add.d          a0,      a0,      a1   //dst += stride
    addi.d         a4,      a4,      -1
    bnez           a4,      .IPRED_SMOOTH_H_H_LOOP

endfunc
1070
// void pal_pred_lsx(pixel *dst, const ptrdiff_t stride,
//                   const pixel *const pal, const uint8_t *idx,
//                   const int w, const int h)
//
// Palette prediction: each idx byte packs two palette indices
// (low nibble = even pixel, high nibble = odd pixel); "vandi.b ..., 7"
// keeps the 3 significant bits since the palette holds 8 entries.  The
// 8-byte palette is loaded into a vector register and pixels are
// produced with vshuf.b table lookups.  Rows are emitted 4 at a time
// (1 row at a time on the w == 64 path); a7 = h/4 counts row groups.
function pal_pred_8bpc_lsx
    srai.d         a7,      a5,      2   //a7 = h/4 (groups of 4 rows)

.PAL_PRED_WLOOP4:
    andi           a6,      a4,      4
    beqz           a6,      .PAL_PRED_WLOOP8  //dispatch on w
    fld.d          f0,      a3,      0   //8 idx bytes = 16 px = 4 rows of 4
    vsrli.b        vr1,     vr0,     4   //high nibbles (odd pixels)
    vandi.b        vr2,     vr0,     7   //low nibbles (even pixels)
    vilvl.b        vr0,     vr1,     vr2 //interleave -> per-pixel indices
    fld.d          f1,      a2,      0   //palette; vr1's old value already consumed
    vshuf.b        vr2,     vr1,     vr1,    vr0  //pixels = pal[idx]

    vstelm.w       vr2,     a0,      0,      0
    add.d          a0,      a0,      a1
    vstelm.w       vr2,     a0,      0,      1
    add.d          a0,      a0,      a1
    vstelm.w       vr2,     a0,      0,      2
    add.d          a0,      a0,      a1
    vstelm.w       vr2,     a0,      0,      3
    add.d          a0,      a0,      a1

    addi.d         a3,      a3,      8
    addi.d         a7,      a7,      -1
    bnez           a7,      .PAL_PRED_WLOOP4
    b              .PAL_PRED_END

.PAL_PRED_WLOOP8:
    andi           a6,      a4,      8
    beqz           a6,      .PAL_PRED_WLOOP16

    vld            vr0,     a3,      0   //16 idx bytes = 32 px = 4 rows of 8
    vsrli.b        vr1,     vr0,     4   //high nibbles
    vandi.b        vr2,     vr0,     7   //low nibbles
    vilvl.b        vr0,     vr1,     vr2 //indices, rows 0-1
    vilvh.b        vr3,     vr1,     vr2 //indices, rows 2-3
    fld.d          f1,      a2,      0   //palette
    vshuf.b        vr0,     vr1,     vr1,    vr0
    vshuf.b        vr3,     vr1,     vr1,    vr3

    vstelm.d       vr0,     a0,      0,      0
    add.d          a0,      a0,      a1
    vstelm.d       vr0,     a0,      0,      1
    add.d          a0,      a0,      a1

    vstelm.d       vr3,     a0,      0,      0
    add.d          a0,      a0,      a1
    vstelm.d       vr3,     a0,      0,      1
    add.d          a0,      a0,      a1

    addi.d         a3,      a3,      16
    addi.d         a7,      a7,      -1
    bnez           a7,      .PAL_PRED_WLOOP8
    b              .PAL_PRED_END

.PAL_PRED_WLOOP16:
    andi           a6,      a4,      16
    beqz           a6,      .PAL_PRED_WLOOP32

    vld            vr0,     a3,      0   //32 idx bytes = 64 px = 4 rows of 16
    vld            vr1,     a3,      16
    fld.d          f6,      a2,      0   //palette
    vsrli.b        vr2,     vr0,     4
    vandi.b        vr3,     vr0,     7
    vsrli.b        vr4,     vr1,     4
    vandi.b        vr5,     vr1,     7
    vilvl.b        vr0,     vr2,     vr3 //row 0 indices
    vilvh.b        vr1,     vr2,     vr3 //row 1 indices
    vilvl.b        vr2,     vr4,     vr5 //row 2 indices
    vilvh.b        vr3,     vr4,     vr5 //row 3 indices
    vshuf.b        vr0,     vr6,     vr6,    vr0
    vshuf.b        vr1,     vr6,     vr6,    vr1
    vshuf.b        vr2,     vr6,     vr6,    vr2
    vshuf.b        vr3,     vr6,     vr6,    vr3

    vst            vr0,     a0,      0
    add.d          a0,      a0,      a1
    vst            vr1,     a0,      0
    add.d          a0,      a0,      a1
    vst            vr2,     a0,      0
    add.d          a0,      a0,      a1
    vst            vr3,     a0,      0
    add.d          a0,      a0,      a1

    addi.d         a3,      a3,      32
    addi.d         a7,      a7,      -1
    bnez           a7,      .PAL_PRED_WLOOP16
    b              .PAL_PRED_END

.PAL_PRED_WLOOP32:
    andi           a6,      a4,      32
    beqz           a6,      .PAL_PRED_WLOOP64

    vld            vr0,     a3,      0   //64 idx bytes = 128 px = 4 rows of 32
    vld            vr1,     a3,      16
    vld            vr2,     a3,      32
    vld            vr3,     a3,      48
    fld.d          f4,      a2,      0   //palette
    vsrli.b        vr5,     vr0,     4
    vandi.b        vr6,     vr0,     7
    vsrli.b        vr7,     vr1,     4
    vandi.b        vr8,     vr1,     7
    vsrli.b        vr9,     vr2,     4
    vandi.b        vr10,    vr2,     7
    vsrli.b        vr11,    vr3,     4
    vandi.b        vr12,    vr3,     7
    vilvl.b        vr0,     vr5,     vr6  //row 0, left half
    vilvh.b        vr1,     vr5,     vr6  //row 0, right half
    vilvl.b        vr2,     vr7,     vr8  //row 1, left half
    vilvh.b        vr3,     vr7,     vr8  //row 1, right half
    vilvl.b        vr5,     vr9,     vr10 //row 2, left half
    vilvh.b        vr6,     vr9,     vr10 //row 2, right half
    vilvl.b        vr7,     vr11,    vr12 //row 3, left half
    vilvh.b        vr8,     vr11,    vr12 //row 3, right half
    vshuf.b        vr0,     vr4,     vr4,    vr0
    vshuf.b        vr1,     vr4,     vr4,    vr1
    vshuf.b        vr2,     vr4,     vr4,    vr2
    vshuf.b        vr3,     vr4,     vr4,    vr3
    vshuf.b        vr5,     vr4,     vr4,    vr5
    vshuf.b        vr6,     vr4,     vr4,    vr6
    vshuf.b        vr7,     vr4,     vr4,    vr7
    vshuf.b        vr8,     vr4,     vr4,    vr8

    vst            vr0,     a0,      0
    vst            vr1,     a0,      16
    add.d          a0,      a0,      a1
    vst            vr2,     a0,      0
    vst            vr3,     a0,      16
    add.d          a0,      a0,      a1
    vst            vr5,     a0,      0
    vst            vr6,     a0,      16
    add.d          a0,      a0,      a1
    vst            vr7,     a0,      0
    vst            vr8,     a0,      16
    add.d          a0,      a0,      a1

    addi.d         a3,      a3,      64
    addi.d         a7,      a7,      -1
    bnez           a7,      .PAL_PRED_WLOOP32
    b              .PAL_PRED_END

.PAL_PRED_WLOOP64:
    // w == 64: one full row (32 idx bytes) per iteration; loop count is
    // a5 = h here rather than the a7 = h/4 group counter.
    vld            vr0,     a3,      0
    vld            vr1,     a3,      16
    fld.d          f2,      a2,      0   //palette
    vsrli.b        vr3,     vr0,     4
    vandi.b        vr4,     vr0,     7
    vsrli.b        vr5,     vr1,     4
    vandi.b        vr6,     vr1,     7
    vilvl.b        vr0,     vr3,     vr4
    vilvh.b        vr1,     vr3,     vr4
    vilvl.b        vr3,     vr5,     vr6
    vilvh.b        vr4,     vr5,     vr6
    vshuf.b        vr0,     vr2,     vr2,    vr0
    vshuf.b        vr1,     vr2,     vr2,    vr1
    vshuf.b        vr3,     vr2,     vr2,    vr3
    vshuf.b        vr4,     vr2,     vr2,    vr4

    vst            vr0,     a0,      0
    vst            vr1,     a0,      16
    vst            vr3,     a0,      32
    vst            vr4,     a0,      48

    add.d          a0,      a0,      a1
    addi.d         a3,      a3,      32
    addi.d         a5,      a5,      -1
    bnez           a5,      .PAL_PRED_WLOOP64

.PAL_PRED_END:
endfunc
1244
// out = \v with the sign of \s applied per 16-bit lane: lanes where
// \s < 0 become -\v, the rest keep \v.  \vrzero must hold zeros,
// \vrt0 is scratch; \v and \s are clobbered.
.macro apply_sign_vrh v, s, vrzero, vrt0 ,out
    vslt.h         \vrt0,   \s,      \vrzero  //mask: all-ones where s < 0
    vandn.v        \s,      \vrt0,   \v       //v in the s >= 0 lanes
    vsigncov.h     \v,      \vrt0,   \v       //-v in the s < 0 lanes, 0 elsewhere
    vor.v          \out,    \s,      \v       //merge both halves
.endm
1251
// out = clamp(\in0, \in1, \in2) per 16-bit lane (\in1 = lower bound,
// \in2 = upper bound).  \in0 is clobbered; \tmp0/\tmp1 are scratch.
.macro iclip_pixel_vrh in0, in1, in2, tmp0, tmp1, out
    vmin.h         \tmp0,   \in2,    \in0  //min(in0, upper)
    vslt.h         \in0,    \in0,    \in1  //mask: in0 < lower
    vand.v         \tmp1,   \in0,    \in1  //lower bound where below
    vandn.v        \tmp0,   \in0,    \tmp0 //min(in0, upper) where not below
    vor.v          \out,    \tmp1,   \tmp0
.endm
1259
// Chroma-from-luma blend: for every int16 ac coefficient,
//   diff = \alpha * ac[x]
//   out  = clamp(\dc + sign(diff) * ((|diff| + 32) >> 6), 0, 255)
// \dc is either a computed DC (t0 from an ipred_dc_gen* macro) or the
// constant 128.  ac samples are int16, so t4 = 2*\w is the ac row
// stride in bytes.  Rows go 8 samples at a time; a single 4-sample
// chunk handles \w == 4.  Clobbers t1-t4 and vr0-vr8.
.macro ipred_cfl_pred dst, stride, w, h, dc, ac, alpha
    vreplgr2vr.h   vr2,     \alpha          //alpha in every lane
    vreplgr2vr.h   vr7,     \dc             //dc in every lane
    li.w           t1,      32
    vreplgr2vr.h   vr3,     t1              //rounding bias 32
    vxor.v         vr4,     vr4,     vr4    //zeros (clip lower bound)
    li.w           t1,      255
    vreplgr2vr.h   vr6,     t1              //pixel max (clip upper bound)
    add.d          t4,      \w,      \w     //ac row stride in bytes

1:  // per-row loop
    or             t1,      zero,    zero   //ac byte offset
    or             t2,      zero,    zero   //dst byte offset
    srai.d         t3,      \w,      3      //chunks of 8
    beqz           t3,      3f              //w == 4 path

2:  // 8 samples per iteration
    vldx           vr0,     \ac,     t1
    vmul.h         vr1,     vr2,     vr0    //alpha*ac (signed)
    vadda.h        vr0,     vr1,     vr3    //|alpha*ac| + 32
    vsrai.h        vr0,     vr0,     6
    apply_sign_vrh vr0, vr1, vr4, vr5, vr0  //reapply sign of alpha*ac
    vadd.h         vr1,     vr0,     vr7    //+dc
    iclip_pixel_vrh vr1, vr4, vr6, vr5, vr8, vr0  //clamp to [0,255]
    vsrlni.b.h     vr0,     vr0,     0      //narrow to u8
    fstx.d         f0,      \dst,    t2

    addi.d         t1,      t1,      16
    addi.d         t2,      t2,      8
    addi.d         t3,      t3,      -1
    bnez           t3,      2b
    b              4f

3:  // w == 4: one chunk of 4 samples
    fld.d          f0,      \ac,     0
    vmul.h         vr1,     vr2,     vr0
    vadda.h        vr0,     vr1,     vr3
    vsrai.h        vr0,     vr0,     6
    apply_sign_vrh vr0, vr1, vr4, vr5, vr0
    vadd.h         vr1,     vr0,     vr7
    iclip_pixel_vrh vr1, vr4, vr6, vr5, vr8, vr0
    vsrlni.b.h     vr0,     vr0,     0
    fst.s          f0,      \dst,    0

4:  // advance to the next row
    add.d          \ac,     \ac,     t4
    add.d          \dst,    \dst,    \stride
    addi.d         \h,      \h,      -1
    bnez           \h,      1b
.endm
1310
// CFL with the DC derived from both the top and left edges
// (ipred_dc_gen leaves the DC in t0), then the CFL blend.
function ipred_cfl_8bpc_lsx
    ipred_dc_gen   a2, a3, a4
    ipred_cfl_pred a0, a1, a3, a4, t0, a5, a6
endfunc
1315
// CFL with the DC derived from the top edge only (DC in t0).
function ipred_cfl_top_8bpc_lsx
    ipred_dc_gen_top   a2, a3
    ipred_cfl_pred a0, a1, a3, a4, t0, a5, a6
endfunc
1320
// CFL with the DC derived from the left edge only (DC in t0).
function ipred_cfl_left_8bpc_lsx
    ipred_dc_gen_left   a2, a4
    ipred_cfl_pred a0, a1, a3, a4, t0, a5, a6
endfunc
1325
// CFL with a fixed DC of 128 (used when no edge pixels are available).
function ipred_cfl_128_8bpc_lsx
    li.w           t0,      128
    ipred_cfl_pred a0, a1, a3, a4, t0, a5, a6
endfunc
1330
// Five 56-byte filter-intra tap sets (arr0-arr4), one per filter mode.
// Each set is 7 rows of 8 int8 taps: row i holds the weights pixel p_i
// contributes to the 8 outputs of a 4x2 sub-block (consumed by
// ipred_filter_load_fltptr / ipred_filter_calc_acc).
// ipred_filter_8bpc_lsx indexes this table as filt_idx * 56.
const dav1d_filter_intra_taps_lsx
    //arr0  8*7
.byte    -6, -5, -3, -3, -4, -3, -3, -3
.byte    10,  2,  1,  1,  6,  2,  2,  1
.byte    0, 10,  1,  1,  0,  6,  2,  2
.byte    0,  0, 10,  2,  0,  0,  6,  2
.byte    0,  0,  0, 10,  0,  0,  0,  6
.byte    12,  9,  7,  5,  2,  2,  2,  3
.byte    0,  0,  0,  0, 12,  9,  7,  5
    //arr1
.byte    -10,  -6,  -4,  -2, -10,  -6,  -4,  -2
.byte    16,   0,   0,   0,  16,   0,   0,   0
.byte    0,  16,   0,   0,   0,  16,   0,   0
.byte    0,   0,  16,   0,   0,   0,  16,   0
.byte    0,   0,   0,  16,   0,   0,   0,  16
.byte    10,   6,   4,   2,   0,   0,   0,   0
.byte    0,   0,   0,   0,  10,   6,   4,   2
    //arr2
.byte    -8,  -8,  -8,  -8,  -4,  -4,  -4,  -4
.byte    8,   0,   0,   0,   4,   0,   0,   0
.byte    0,   8,   0,   0,   0,   4,   0,   0
.byte    0,   0,   8,   0,   0,   0,   4,   0
.byte    0,   0,   0,   8,   0,   0,   0,   4
.byte    16,  16,  16,  16,   0,   0,   0,   0
.byte    0,   0,   0,   0,  16,  16,  16,  16
    //arr3
.byte    -2,  -1,  -1,   0,  -1,  -1,  -1,  -1
.byte    8,   3,   2,   1,   4,   3,   2,   2
.byte    0,   8,   3,   2,   0,   4,   3,   2
.byte    0,   0,   8,   3,   0,   0,   4,   3
.byte    0,   0,   0,   8,   0,   0,   0,   4
.byte    10,   6,   4,   2,   3,   4,   4,   3
.byte    0,   0,   0,   0,  10,   6,   4,   3
    //arr4
.byte    -12, -10,  -9,  -8, -10,  -9,  -8,  -7
.byte    14,   0,   0,   0,  12,   1,   0,   0
.byte    0,  14,   0,   0,   0,  12,   0,   0
.byte    0,   0,  14,   0,   0,   0,  12,   1
.byte    0,   0,   0,  14,   0,   0,   0,  12
.byte    14,  12,  11,  10,   0,   0,   1,   1
.byte    0,   0,   0,   0,  14,  12,  11,   9
endconst
1373
// Load the 7 source pixels of a 4x2 filter-intra sub-block and widen
// each, splatted, to u16:
//   vr0 = topleft (t0[0]), vr1-vr4 = top[0..3] (a7[0..3]),
//   vr5 = left[0] (t1[0]), vr6 = left[1] (t1[-1] -- the left column
//   is stored at descending addresses below topleft).
.macro ipred_filter_load_p
    vldrepl.b      vr0,     t0,      0
    vldrepl.b      vr1,     a7,      0
    vldrepl.b      vr2,     a7,      1
    vldrepl.b      vr3,     a7,      2
    vldrepl.b      vr4,     a7,      3
    vldrepl.b      vr5,     t1,      0
    vldrepl.b      vr6,     t1,      -1

    vsllwil.hu.bu  vr0,     vr0,     0
    vsllwil.hu.bu  vr1,     vr1,     0
    vsllwil.hu.bu  vr2,     vr2,     0
    vsllwil.hu.bu  vr3,     vr3,     0
    vsllwil.hu.bu  vr4,     vr4,     0
    vsllwil.hu.bu  vr5,     vr5,     0
    vsllwil.hu.bu  vr6,     vr6,     0
.endm
1391
// Same as ipred_filter_load_p, but for sub-blocks past the first
// column: t1 points into already-written dst pixels, so the second
// left pixel sits one stride below (t1 + a1, fetched via ldx.bu with
// stride a1) instead of at t1 - 1.  Clobbers t3.
.macro ipred_filter_loadx_p
    vldrepl.b      vr0,     t0,      0
    vldrepl.b      vr1,     a7,      0
    vldrepl.b      vr2,     a7,      1
    vldrepl.b      vr3,     a7,      2
    vldrepl.b      vr4,     a7,      3
    vldrepl.b      vr5,     t1,      0
    ldx.bu         t3,      t1,      a1
    vreplgr2vr.b   vr6,     t3

    vsllwil.hu.bu  vr0,     vr0,     0
    vsllwil.hu.bu  vr1,     vr1,     0
    vsllwil.hu.bu  vr2,     vr2,     0
    vsllwil.hu.bu  vr3,     vr3,     0
    vsllwil.hu.bu  vr4,     vr4,     0
    vsllwil.hu.bu  vr5,     vr5,     0
    vsllwil.hu.bu  vr6,     vr6,     0
.endm
1410
// Load the 7 rows of 8 int8 taps from the selected tap set (a6, one of
// the dav1d_filter_intra_taps_lsx entries) and sign-extend them to
// int16 in vr7-vr13.
.macro ipred_filter_load_fltptr
    fld.d          f7,      a6,      0
    fld.d          f8,      a6,      8
    fld.d          f9,      a6,      16
    fld.d          f10,     a6,      24
    fld.d          f11,     a6,      32
    fld.d          f12,     a6,      40
    fld.d          f13,     a6,      48

    vsllwil.h.b    vr7,     vr7,     0
    vsllwil.h.b    vr8,     vr8,     0
    vsllwil.h.b    vr9,     vr9,     0
    vsllwil.h.b    vr10,    vr10,    0
    vsllwil.h.b    vr11,    vr11,    0
    vsllwil.h.b    vr12,    vr12,    0
    vsllwil.h.b    vr13,    vr13,    0
.endm
1428
// acc = sum over i of taps_row_i (vr7-vr13) * pixel p_i (vr0-vr6),
// rounded (+8, >>4), clamped to [0,255] via vr14/vr15, then narrowed
// to 8 bytes in vr8 (the 4x2 sub-block's output).  Clobbers vr7-vr10.
.macro ipred_filter_calc_acc
    vmul.h         vr7,     vr7,     vr0
    vmadd.h        vr7,     vr8,     vr1
    vmadd.h        vr7,     vr9,     vr2
    vmadd.h        vr7,     vr10,    vr3
    vmadd.h        vr7,     vr11,    vr4
    vmadd.h        vr7,     vr12,    vr5
    vmadd.h        vr7,     vr13,    vr6
    vaddi.hu       vr7,     vr7,     8   //rounding bias
    vsrai.h        vr7,     vr7,     4
    iclip_pixel_vrh vr7, vr14, vr15, vr9, vr10, vr8
    vsrlni.b.h     vr8,     vr8,     0
.endm
1442
// void ipred_filter_lsx(pixel *dst, const ptrdiff_t stride,
//                       const pixel *const topleft_in,
//                       const int width, const int height, int filt_idx,
//                       const int max_width, const int max_height
//                       HIGHBD_DECL_SUFFIX)
//
// FILTER_INTRA prediction.  The block is processed two rows per
// iteration, each row pair as a sequence of 4x2 sub-blocks from left to
// right.  Every sub-block is predicted from 7 neighbouring pixels
// p0..p6 using the 8x7 tap set selected by filt_idx from
// dav1d_filter_intra_taps_lsx (56 bytes per set).
//
// Register roles in the row loop:
//   a5 = y, a6 = tap set, a7 = top pointer of the current sub-block,
//   t0 = topleft pointer, t1 = left pointer, t3 = dst cursor,
//   vr14/vr15 = clip bounds 0/255.
//
// Refactor: the per-sub-block sequence was previously copy-pasted 12
// times across the W8/W16/W32 paths; it is now emitted through the two
// macros below, producing the identical instruction stream.

// Leftmost 4x2 sub-block of a row pair: left pixels come from the edge
// (t1, set up by .FILTER_LOOP_H); output is written at a0.
.macro ipred_filter_first_blk
    ipred_filter_load_p

    or             t3,      a0,      a0  //*ptr

    ipred_filter_load_fltptr
    ipred_filter_calc_acc

    fst.s          f8,      t3,      0
    add.d          t3,      t3,      a1
    vstelm.w       vr8,     t3,      0,      1
    add.d          t3,      t3,      a1
.endm

// Subsequent 4x2 sub-block at column \xoff: the left pixels are the
// just-written pixels of the previous sub-block (dst + \xoff - 1); the
// lower one is fetched a stride below inside ipred_filter_loadx_p.
// Advances the top pointer a7 by 4.
.macro ipred_filter_next_blk xoff
    addi.d         t1,      a0,      \xoff - 1
    addi.d         a7,      a7,      4
    addi.d         t0,      a7,      -1

    ipred_filter_loadx_p

    addi.d         t3,      a0,      \xoff

    ipred_filter_load_fltptr
    ipred_filter_calc_acc

    fst.s          f8,      t3,      0
    add.d          t3,      t3,      a1
    vstelm.w       vr8,     t3,      0,      1
    add.d          t3,      t3,      a1
.endm

function ipred_filter_8bpc_lsx
    andi           a5,      a5,      511
    la.local       a6,      dav1d_filter_intra_taps_lsx
    li.w           a7,      56
    mul.w          a7,      a7,      a5
    add.d          a6,      a6,      a7   //*filter = taps + filt_idx*56
    addi.d         a7,      a2,      1    //*top
    or             a5,      zero,    zero //y
    vxor.v         vr14,    vr14,    vr14 //clip lower bound 0
    li.w           t0,      255
    vreplgr2vr.h   vr15,    t0           //clip upper bound 255

.FILTER_LOOP_H:
    sub.d          t0,      a2,      a5   //*topleft for row y
    addi.d         t1,      t0,      -1   //left

    ctz.w          t2,      a3           //log2(width): 2/3/4/5
    addi.d         t3,      t2,      -2
    beqz           t3,      .FILTER_LOOP_W4
    addi.d         t3,      t2,      -3
    beqz           t3,      .FILTER_LOOP_W8
    addi.d         t3,      t2,      -4
    beqz           t3,      .FILTER_LOOP_W16
    addi.d         t3,      t2,      -5
    beqz           t3,      .FILTER_LOOP_W32

.FILTER_LOOP_W4:
    ipred_filter_first_blk
    b              .FILTER_LOOP_W_END

.FILTER_LOOP_W8:
    ipred_filter_first_blk
    ipred_filter_next_blk 4
    b              .FILTER_LOOP_W_END

.FILTER_LOOP_W16:
    ipred_filter_first_blk
    ipred_filter_next_blk 4
    ipred_filter_next_blk 8
    ipred_filter_next_blk 12
    b              .FILTER_LOOP_W_END

.FILTER_LOOP_W32:
    ipred_filter_first_blk
    ipred_filter_next_blk 4
    ipred_filter_next_blk 8
    ipred_filter_next_blk 12
    ipred_filter_next_blk 16
    ipred_filter_next_blk 20
    ipred_filter_next_blk 24
    ipred_filter_next_blk 28

.FILTER_LOOP_W_END:
    add.d          a7,      a0,      a1   //next pair's top = 2nd row just written
    add.d          t2,      a1,      a1
    add.d          a0,      a0,      t2   //dst += 2*stride
    addi.d         a5,      a5,      2
    blt            a5,      a4,      .FILTER_LOOP_H
endfunc
1715
// Inverse-slope (dy/dx) lookup for directional intra prediction,
// sparsely indexed by prediction angle as listed in the per-entry
// comments.
const dav1d_dr_intra_derivative
    // Values that are 0 will never be used
    .short  0         // Angles:
    .short  1023, 0   //  3,  93, 183
    .short  547       //  6,  96, 186
    .short  372, 0, 0 //  9,  99, 189
    .short  273       // 14, 104, 194
    .short  215, 0    // 17, 107, 197
    .short  178       // 20, 110, 200
    .short  151, 0    // 23, 113, 203 (113 & 203 are base angles)
    .short  132       // 26, 116, 206
    .short  116, 0    // 29, 119, 209
    .short  102, 0    // 32, 122, 212
    .short  90        // 36, 126, 216
    .short  80, 0     // 39, 129, 219
    .short  71        // 42, 132, 222
    .short  64, 0     // 45, 135, 225 (45 & 135 are base angles)
    .short  57        // 48, 138, 228
    .short  51, 0     // 51, 141, 231
    .short  45, 0     // 54, 144, 234
    .short  40        // 58, 148, 238
    .short  35, 0     // 61, 151, 241
    .short  31        // 64, 154, 244
    .short  27, 0     // 67, 157, 247 (67 & 157 are base angles)
    .short  23        // 70, 160, 250
    .short  19, 0     // 73, 163, 253
    .short  15, 0     // 76, 166, 256
    .short  11, 0     // 81, 171, 261
    .short  7         // 84, 174, 264
    .short  3         // 87, 177, 267
endconst
1747
// 4-tap edge-upsampling kernel (-1, 9, 9, -1), duplicated so both
// vector halves filter in parallel; the tap sum (16) matches the >>4
// rounding in z1_upsample_edge_calc_loop.
const z1_upsample_edge_kernel
    .short  -1, 9, 9, -1, -1, 9, 9, -1
endconst
1751
// First four taps of the 5-tap edge-smoothing kernels, one row per
// filter strength (1-3), duplicated across both vector halves.  The
// fifth tap of each kernel lives in ipred_filter_edge_kernel2.
const ipred_filter_edge_kernel1
    .short  0, 4, 8, 4, 0, 4, 8, 4
    .short  0, 5, 6, 5, 0, 5, 6, 5
    .short  2, 4, 4, 4, 2, 4, 4, 4
endconst
1757
// Fifth tap of each edge-smoothing kernel; rows correspond to the rows
// of ipred_filter_edge_kernel1.  Only strength 3 has a nonzero fifth
// tap, so the total tap sum is 16 for every strength (>>4 rounding).
const ipred_filter_edge_kernel2
    .short  0, 0, 0, 0, 0, 0, 0, 0
    .short  0, 0, 0, 0, 0, 0, 0, 0
    .short  2, 2, 2, 2, 2, 2, 2, 2
endconst
1763
// Core of the edge upsampler.  On entry (set up by the caller):
//   vr7       = 8+ source edge bytes,
//   vr11-vr13 = the source shifted by 1/2/3 bytes (the 4-pixel windows),
//   vr0       = the (-1,9,9,-1) kernel, vr15/vr16 = clip bounds
//               (presumably 0/255 -- set up outside this view).
// Each window is widened, multiplied by the kernel and reduced to one
// filtered midpoint (two per register after the double vhaddw); the 8
// midpoints are packed, rounded (>>4), clipped, narrowed, and finally
// interleaved with the original pixels into vr13:
//   vr13 = { src[1], mid[0], src[2], mid[1], ... } (upsampled edge).
.macro z1_upsample_edge_calc_loop
    vsllwil.hu.bu  vr10,    vr7,     0   //widen window 0
    vsllwil.hu.bu  vr11,    vr11,    0   //widen window 1
    vsllwil.hu.bu  vr12,    vr12,    0   //widen window 2
    vsllwil.hu.bu  vr13,    vr13,    0   //widen window 3

    vmul.h         vr10,    vr10,    vr0
    vmul.h         vr11,    vr11,    vr0
    vmul.h         vr12,    vr12,    vr0
    vmul.h         vr13,    vr13,    vr0

    vhaddw.w.h     vr10,    vr10,    vr10  //pairwise sums
    vhaddw.w.h     vr11,    vr11,    vr11
    vhaddw.w.h     vr12,    vr12,    vr12
    vhaddw.w.h     vr13,    vr13,    vr13
    vhaddw.d.w     vr10,    vr10,    vr10  //full 4-tap sums (2 per reg)
    vhaddw.d.w     vr11,    vr11,    vr11
    vhaddw.d.w     vr12,    vr12,    vr12
    vhaddw.d.w     vr13,    vr13,    vr13

    vpackev.h      vr10,    vr11,    vr10
    vpackev.h      vr11,    vr13,    vr12
    vpackev.w      vr12,    vr11,    vr10  //s:01234567
    vsrari.h       vr12,    vr12,    4     //round by the tap sum 16
    iclip_pixel_vrh vr12, vr15, vr16, vr10, vr11, vr12
    vsrlni.b.h     vr12,    vr12,    0  //out: 13579...
    vbsrl.v        vr11,    vr7,     1  //out:02468...
    vilvl.b        vr13,    vr12,    vr11
.endm
1793
// Build the three shifted windows (src+1, src+2, src+3) for a full
// 8-output upsample step (no edge clamping needed) and run the
// upsample core.
.macro z1_upsample_edge_data_init1
    vbsrl.v        vr11,    vr7,     1
    vbsrl.v        vr12,    vr7,     2
    vbsrl.v        vr13,    vr7,     3
    z1_upsample_edge_calc_loop
.endm
1800
// Same as z1_upsample_edge_data_init1, but for the final step at the
// right edge: vextrins duplicates the last valid byte into the window
// positions that would otherwise read past the edge (byte 7 <- byte 6,
// bytes 6/7 <- byte 5).
.macro z1_upsample_edge_data_init2
    vbsrl.v        vr11,    vr7,     1
    vbsrl.v        vr12,    vr7,     2
    vextrins.b     vr12,    vr12,    0x76  //clamp: b[7] = b[6]
    vbsrl.v        vr13,    vr7,     3
    vextrins.b     vr13,    vr13,    0x65  //clamp: b[6] = b[5]
    vextrins.b     vr13,    vr13,    0x75  //clamp: b[7] = b[5]
    z1_upsample_edge_calc_loop
.endm
1810
// Degenerate upsample step: vr7 holds one repeated window, so a single
// 4-tap sum is computed, replicated to all lanes, rounded, clipped and
// interleaved with the source pixels into vr13.  Used for the region
// where every output derives from the same pixels (edge padding --
// NOTE(review): caller not visible here, confirm against the z1 code).
.macro z1_upsample_edge_calc_other
    vsllwil.hu.bu  vr10,    vr7,     0
    vmul.h         vr10,    vr10,    vr0
    vhaddw.w.h     vr10,    vr10,    vr10
    vhaddw.d.w     vr10,    vr10,    vr10
    vreplvei.h     vr12,    vr10,    0   //s0-s7: splat the single sum
    vsrari.h       vr12,    vr12,    4

    iclip_pixel_vrh vr12, vr15, vr16, vr10, vr11, vr12
    vsrlni.b.h     vr12,    vr12,    0
    vilvl.b        vr13,    vr12,    vr7
.endm
1823
// First stage of the 5-tap edge filter: multiply the four pre-widened
// pixel windows (vr10-vr13, prepared by a data_init macro) with the
// 4-tap kernel half in vr1 and reduce each window to a scalar sum;
// pack the 8 partial sums into vr10 in output order 0..7.  The fifth
// tap is added later by z1_filter_edge_calc_loop2.
.macro z1_filter_edge_calc_loop1
    vmul.h         vr10,    vr10,    vr1
    vmul.h         vr11,    vr11,    vr1
    vmul.h         vr12,    vr12,    vr1
    vmul.h         vr13,    vr13,    vr1

    vhaddw.w.h     vr10,    vr10,    vr10  //pairwise sums
    vhaddw.w.h     vr11,    vr11,    vr11
    vhaddw.w.h     vr12,    vr12,    vr12
    vhaddw.w.h     vr13,    vr13,    vr13
    vhaddw.d.w     vr10,    vr10,    vr10  //full 4-tap sums
    vhaddw.d.w     vr11,    vr11,    vr11
    vhaddw.d.w     vr12,    vr12,    vr12
    vhaddw.d.w     vr13,    vr13,    vr13

    vpackev.h      vr10,    vr11,    vr10
    vpackev.h      vr11,    vr13,    vr12
    vpackev.w      vr10,    vr11,    vr10  //s:01234567
.endm
1843
// Filter-edge stage 2: widen the remaining tap pixels in vr13 and
// multiply-accumulate them with the tap weight in vr6 (presumably the
// centre tap of the edge filter — confirm against the kernel table) into
// the stage-1 partial sums (vr10), then round (>>4 with rounding) and
// narrow to the eight output bytes in vr12.
.macro z1_filter_edge_calc_loop2
    vsllwil.hu.bu  vr13,    vr13,    0   // widen u8 -> u16
    vmadd.h        vr10,    vr13,    vr6 // sums += pixel * vr6 tap
    vsrari.h       vr12,    vr10,    4   // round to nearest, >>4
    vsrlni.b.h     vr12,    vr12,    0  //out: 0-7
.endm
1850
// Filter-edge kernel for the padding region: vr10 holds the (mostly
// replicated) source bytes; one kernel dot-product is formed, and the two
// reduced sums are arranged so lane0 gets the low-half sum (vextrins) and
// the other lanes the high-half sum (vreplvei of h-lane 4 = d-lane 1).
// The remaining tap (weight in vr6, pixel = widened lane 1 of vr10) is
// accumulated, then rounded and narrowed to 8 output bytes in vr12.
// NOTE(review): lane choices match the replicated-pixel layout used by the
// callers — verify against the z1 edge-filter kernel table if modifying.
.macro z1_filter_edge_calc_other
    vsllwil.hu.bu  vr10,    vr10,    0   // widen u8 -> u16
    vmul.h         vr11,    vr10,    vr1 // multiply by kernel taps
    vhaddw.w.h     vr11,    vr11,    vr11
    vhaddw.d.w     vr11,    vr11,    vr11 // reduce both 4-tap halves
    vreplvei.h     vr12,    vr11,    4   // broadcast high-half sum
    vextrins.h     vr12,    vr11,    0x00 // lane0 = low-half sum

    vreplvei.h     vr13,    vr10,    1   // remaining tap pixel
    vmadd.h        vr12,    vr13,    vr6 // sums += pixel * vr6 tap
    vsrari.h       vr12,    vr12,    4   // round to nearest, >>4
    vsrlni.b.h     vr12,    vr12,    0  //out: 0-7
.endm
1864
// Filter-edge tap setup, left-clamped variant (first group of a run):
// vr10 = pixels shifted right one position with the first pixel duplicated
// into lane 0 (vbsll then lane0 <- lane1), i.e. the p[-1] window clamped
// at the left edge; vr11/vr12/vr13 = windows at offsets 0/+1/+2.
// All four windows are widened to u16, then stage 1 reduces them.
.macro z1_filter_edge_data_init1
    vbsll.v        vr10,    vr7,     1   // shift left one byte (p[-1] window)
    vextrins.b     vr10,    vr10,    0x01 // lane0 <- lane1 (repeat first pixel)
    vbsrl.v        vr12,    vr7,     1
    vbsrl.v        vr13,    vr7,     2
    vsllwil.hu.bu  vr10,    vr10,    0
    vsllwil.hu.bu  vr11,    vr7,     0
    vsllwil.hu.bu  vr12,    vr12,    0
    vsllwil.hu.bu  vr13,    vr13,    0
    z1_filter_edge_calc_loop1
.endm
1876
// Filter-edge tap setup, interior variant (no clamping at either end):
// windows at offsets 0/+1/+2/+3 of the pixels in vr7, widened to u16,
// then reduced by stage 1 (z1_filter_edge_calc_loop1).
.macro z1_filter_edge_data_init2
    vbsrl.v        vr11,    vr7,     1
    vbsrl.v        vr12,    vr7,     2
    vbsrl.v        vr13,    vr7,     3
    vsllwil.hu.bu  vr10,    vr7,     0
    vsllwil.hu.bu  vr11,    vr11,    0
    vsllwil.hu.bu  vr12,    vr12,    0
    vsllwil.hu.bu  vr13,    vr13,    0
    z1_filter_edge_calc_loop1
.endm
1887
// Filter-edge tap setup, right-clamped variant (last group of a run):
// like init2 (offsets 0/+1/+2/+3) but the furthest window's final lane is
// patched by repeating the last in-range pixel (vr13 lane7 <- lane6).
.macro z1_filter_edge_data_init3
    vbsrl.v        vr11,    vr7,     1
    vbsrl.v        vr12,    vr7,     2
    vbsrl.v        vr13,    vr7,     3
    vextrins.b     vr13,    vr13,    0x76  // lane7 <- lane6 (repeat last pixel)
    vsllwil.hu.bu  vr10,    vr7,     0
    vsllwil.hu.bu  vr11,    vr11,    0
    vsllwil.hu.bu  vr12,    vr12,    0
    vsllwil.hu.bu  vr13,    vr13,    0
    z1_filter_edge_calc_loop1
.endm
1899
// Filter-edge tap setup, clamped at both ends (run fits in one group):
// combines init1's left clamp (vr10 lane0 repeats the first pixel) with
// init3's right clamp (vr13 lane7 repeats the last pixel).
.macro z1_filter_edge_data_init4
    vbsll.v        vr10,    vr7,     1   // shift left one byte (p[-1] window)
    vextrins.b     vr10,    vr10,    0x01 // lane0 <- lane1 (repeat first pixel)
    vbsrl.v        vr12,    vr7,     1
    vbsrl.v        vr13,    vr7,     2
    vextrins.b     vr13,    vr13,    0x76 // lane7 <- lane6 (repeat last pixel)
    vsllwil.hu.bu  vr10,    vr10,    0
    vsllwil.hu.bu  vr11,    vr7,     0
    vsllwil.hu.bu  vr12,    vr12,    0
    vsllwil.hu.bu  vr13,    vr13,    0
    z1_filter_edge_calc_loop1
.endm
1912
// Fill \width bytes at \dst_ptr with the single byte at \src_ptr
// (8bpc pixel_set, any width).  The byte is splatted into vr10 and
// written in 16-byte chunks, then the 8/4/2/1-byte remainders implied
// by the low bits of \width are stored individually.
// In:  \dst_ptr, \src_ptr = GPR pointers; \width = byte count (GPR).
// Scratch: \tmp0 (chunk counter / remainder test / reload of the byte),
//          \tmp1 (running byte offset into \dst_ptr).  Clobbers vr10/f10.
// Uses local numeric labels 1-6.
.macro pixel_set_8bpc_allw dst_ptr, src_ptr, width, tmp0, tmp1
    vldrepl.b      vr10,    \src_ptr, 0      // splat source byte across vr10
    or             \tmp1,   zero,     zero   // offset = 0
    srai.d         \tmp0,   \width,   4      // number of 16-byte chunks
    beqz           \tmp0,   2f
1:  // store 16 bytes per iteration
    vstx           vr10,    \dst_ptr, \tmp1
    addi.d         \tmp1,   \tmp1,    16
    addi.d         \tmp0,   \tmp0,    -1
    bnez           \tmp0,   1b
2:  // remainder: 8 bytes (low half of vr10 via f10)
    andi           \tmp0,   \width,   8
    beqz           \tmp0,   3f
    fstx.d         f10,     \dst_ptr, \tmp1
    addi.d         \tmp1,   \tmp1,    8
3:  // remainder: 4 bytes
    andi           \tmp0,   \width,   4
    beqz           \tmp0,   4f
    fstx.s         f10,     \dst_ptr, \tmp1
    addi.d         \tmp1,   \tmp1,    4
4:  // remainder: 2 bytes (reload the byte into a GPR for byte stores)
    andi           \tmp0,   \width,   2
    beqz           \tmp0,   5f
    ldx.bu         \tmp0,   \src_ptr, zero
    stx.b          \tmp0,   \dst_ptr, \tmp1
    addi.d         \tmp1,   \tmp1,    1
    stx.b          \tmp0,   \dst_ptr, \tmp1
    addi.d         \tmp1,   \tmp1,    1
5:  // remainder: 1 byte
    andi           \tmp0,   \width,   1
    beqz           \tmp0,   6f
    ldx.bu         \tmp0,   \src_ptr, zero
    stx.b          \tmp0,   \dst_ptr, \tmp1
6:
.endm
1948
1949// void ipred_z1_lsx(pixel *dst, const ptrdiff_t stride,
1950//                   const pixel *const topleft_in,
1951//                   const int width, const int height, int angle,
1952//                   const int max_width, const int max_height
1953//                   HIGHBD_DECL_SUFFIX)
1954function ipred_z1_8bpc_lsx
1955    addi.d         a2,      a2,      1   //&topleft_in[1]
1956    addi.d         sp,      sp,      -128
1957    or             t2,      sp,      sp  //top_out
1958    srai.d         a6,      a5,      9
1959    andi           a6,      a6,      1   //is_sum
1960    srai.d         a7,      a5,      10  //enable_intra_edge_filter
1961    andi           a5,      a5,      511
1962
1963    la.local       t0,      dav1d_dr_intra_derivative
1964    andi           t1,      a5,      0xFFE
1965    ldx.hu         t1,      t0,      t1  //dx
1966
1967    beqz           a7,      .IPRED_Z1_NOTUA
1968    add.d          t3,      a3,      a4
1969    li.w           t4,      90
1970    sub.w          t4,      t4,      a5
1971    // ipred_get_upsample t5:upsample_above
1972    li.w           t6,      16
1973    sra.d          t6,      t6,      a6
1974    bge            t6,      t3,      .Z1_GETUS1
1975    addi.d         t5,      zero,    0
1976    b              .Z1_GETUS2
1977.Z1_GETUS1:
1978    addi.d         t5,      zero,    1
1979.Z1_GETUS2:
1980    li.w           t6,      40
1981    blt            t4,      t6,      .Z1_GETUS3
1982    addi.d         t6,      zero,    0
1983    b              .Z1_GETUS4
1984.Z1_GETUS3:
1985    addi.d         t6,      zero,    1
1986.Z1_GETUS4:
1987    and            t5,      t5,      t6
1988
1989    beqz           t5,      .IPRED_Z1_NOTUA
1990
1991    la.local       t0,      z1_upsample_edge_kernel
1992    vld            vr0,     t0,      0   //kernel
1993    vxor.v         vr15,    vr15,    vr15
1994    li.w           t0,      255
1995    vreplgr2vr.h   vr16,    t0
1996
1997.Z1_UEDGE_W4:
1998    andi           t6,      a3,     4
1999    beqz           t6,      .Z1_UEDGE_W8
2000.Z1_UEDGE_W4_H4:
2001    andi           t6,      a4,     4
2002    beqz           t6,      .Z1_UEDGE_W4_H8
2003
2004    //0-6
2005    vld            vr7,     a2,      -1
2006    vbsrl.v        vr11,    vr7,     1
2007    vbsrl.v        vr12,    vr7,     2
2008    vextrins.b     vr12,    vr12,    0x76
2009    vbsrl.v        vr13,    vr7,     3
2010    z1_upsample_edge_calc_loop
2011
2012    fst.d          f13,     t2,     0
2013    vstelm.w       vr13,    t2,     8,    2
2014    vstelm.h       vr13,    t2,     12,   6
2015
2016    ld.bu          t7,      a2,     7
2017    st.b           t7,      t2,     14
2018
2019    b              .Z1_UEDGE_END
2020
2021.Z1_UEDGE_W4_H8:
2022    andi           t6,      a4,     8
2023    beqz           t6,      .Z1_UEDGE_W4_H16
2024
2025    //0-7
2026    vld            vr7,     a2,      -1
2027    z1_upsample_edge_data_init2
2028    vst            vr13,    t2,     0
2029
2030    //8-10
2031    vldrepl.b      vr7,     a2,     7
2032    z1_upsample_edge_calc_other
2033
2034    vstelm.w       vr13,    t2,     16,   0
2035    vstelm.h       vr13,    t2,     20,   2
2036
2037    ld.bu          t7,      a2,     7
2038    st.b           t7,      t2,     22
2039
2040    b              .Z1_UEDGE_END
2041
2042.Z1_UEDGE_W4_H16:
2043    andi           t6,      a4,     16
2044    beqz           t6,      .Z1_UEDGE_W4_H32
2045
2046    //0-7
2047    vld            vr7,     a2,      -1
2048    z1_upsample_edge_data_init2
2049    vst            vr13,    t2,     0
2050
2051    //8-15
2052    vldrepl.b      vr7,     a2,     7
2053    z1_upsample_edge_calc_other
2054    vst            vr13,    t2,     16
2055
2056    //16-18
2057    vstelm.w       vr13,    t2,     32,   0
2058    vstelm.h       vr13,    t2,     36,   2
2059
2060    ld.bu          t7,      a2,     7
2061    st.b           t7,      t2,     38
2062
2063    b              .Z1_UEDGE_END
2064
2065.Z1_UEDGE_W4_H32:
2066    andi           t6,      a4,     32
2067    beqz           t6,      .Z1_UEDGE_W4_H64
2068
2069    //0-7
2070    vld            vr7,     a2,      -1
2071    z1_upsample_edge_data_init2
2072    vst            vr13,    t2,     0
2073
2074    //8-15
2075    vldrepl.b      vr7,     a2,     7
2076    z1_upsample_edge_calc_other
2077    vst            vr13,    t2,     16
2078
2079    vst            vr13,    t2,     32 //16-23
2080    vst            vr13,    t2,     48 //24-31
2081
2082    //32-34
2083    vstelm.w       vr13,    t2,     64,   0
2084    vstelm.h       vr13,    t2,     68,   2
2085
2086    ld.bu          t7,      a2,     7
2087    st.b           t7,      t2,     70
2088
2089    b              .Z1_UEDGE_END
2090
2091.Z1_UEDGE_W4_H64:
2092    //0-7
2093    vld            vr7,     a2,      -1
2094    z1_upsample_edge_data_init2
2095    vst            vr13,    t2,     0
2096
2097    //8-15
2098    vldrepl.b      vr7,     a2,     7
2099    z1_upsample_edge_calc_other
2100    vst            vr13,    t2,     16
2101
2102    vst            vr13,    t2,     32 //16-23
2103    vst            vr13,    t2,     48 //24-31
2104    vst            vr13,    t2,     64 //32-39
2105    vst            vr13,    t2,     80 //40-47
2106    vst            vr13,    t2,     96 //48-55
2107    vst            vr13,    t2,     112 //56-63
2108
2109    //64-66
2110    vstelm.w       vr13,    t2,     128,   0
2111    vstelm.h       vr13,    t2,     132,   2
2112
2113    ld.bu          t7,      a2,     7
2114    st.b           t7,      t2,     134
2115
2116    b              .Z1_UEDGE_END
2117
2118.Z1_UEDGE_W8:
2119    andi           t6,      a3,     8
2120    beqz           t6,      .Z1_UEDGE_W16
2121.Z1_UEDGE_W8_H4:
2122    andi           t6,      a4,     4
2123    beqz           t6,      .Z1_UEDGE_W8_H8
2124
2125    //0-7
2126    vld            vr7,     a2,      -1
2127    z1_upsample_edge_data_init1
2128    vst            vr13,    t2,     0
2129
2130    //8-15
2131    vld            vr7,     a2,      7
2132    vbsrl.v        vr11,    vr7,     1
2133    vbsrl.v        vr12,    vr7,     2
2134    vextrins.b     vr12,    vr12,    0x32
2135    vbsrl.v        vr13,    vr7,     3
2136    vextrins.b     vr13,    vr13,    0x21
2137    vextrins.b     vr13,    vr13,    0x31
2138    z1_upsample_edge_calc_loop
2139    vstelm.w       vr13,    t2,     16,    0
2140    vstelm.h       vr13,    t2,     20,    2
2141
2142    ld.bu          t7,      a2,     11
2143    st.b           t7,      t2,     22
2144    b              .Z1_UEDGE_END
2145
2146.Z1_UEDGE_W8_H8:
2147    andi           t6,      a4,     8
2148    beqz           t6,      .Z1_UEDGE_W8_H16
2149
2150    //0-7
2151    vld            vr7,     a2,      -1
2152    z1_upsample_edge_data_init1
2153    vst            vr13,    t2,     0
2154
2155    //8-14
2156    vld            vr7,     a2,      7
2157    vbsrl.v        vr11,    vr7,     1
2158    vbsrl.v        vr12,    vr7,     2
2159    vextrins.b     vr12,    vr12,    0x76
2160    vbsrl.v        vr13,    vr7,     3
2161    z1_upsample_edge_calc_loop
2162    fst.d          f13,     t2,     16
2163    vstelm.w       vr13,    t2,     24,    2
2164    vstelm.h       vr13,    t2,     28,    6
2165
2166    ld.bu          t7,      a2,     15
2167    st.b           t7,      t2,     30
2168    b              .Z1_UEDGE_END
2169
2170.Z1_UEDGE_W8_H16:
2171    andi           t6,      a4,     16
2172    beqz           t6,      .Z1_UEDGE_W8_H32
2173
2174    //0-7
2175    vld            vr7,     a2,      -1
2176    z1_upsample_edge_data_init1
2177    vst            vr13,    t2,     0
2178
2179    //8-15
2180    vld            vr7,     a2,      7
2181    z1_upsample_edge_data_init2
2182    vst            vr13,    t2,     16
2183
2184    //16-22
2185    vldrepl.b      vr7,     a2,     15
2186    z1_upsample_edge_calc_other
2187    fst.d          f13,     t2,     32
2188    vstelm.w       vr13,    t2,     40,   2
2189    vstelm.h       vr13,    t2,     44,   6
2190
2191    ld.bu          t7,      a2,     15
2192    st.b           t7,      t2,     46
2193    b              .Z1_UEDGE_END
2194
2195.Z1_UEDGE_W8_H32:
2196    andi           t6,      a4,     32
2197    beqz           t6,      .Z1_UEDGE_W8_H64
2198
2199    //0-7
2200    vld            vr7,     a2,      -1
2201    z1_upsample_edge_data_init1
2202    vst            vr13,    t2,     0
2203
2204    //8-15
2205    vld            vr7,     a2,      7
2206    z1_upsample_edge_data_init2
2207    vst            vr13,    t2,     16
2208
2209    //16-23
2210    vldrepl.b      vr7,     a2,     15
2211    z1_upsample_edge_calc_other
2212    vst            vr13,    t2,     32
2213
2214    vst            vr13,    t2,     48 //24-31
2215
2216    //32-38
2217    fst.d          f13,     t2,     64
2218    vstelm.w       vr13,    t2,     72,   2
2219    vstelm.h       vr13,    t2,     76,   6
2220
2221    ld.bu          t7,      a2,     15
2222    st.b           t7,      t2,     78
2223    b              .Z1_UEDGE_END
2224
2225.Z1_UEDGE_W8_H64:
2226    //0-7
2227    vld            vr7,     a2,      -1
2228    z1_upsample_edge_data_init1
2229    vst            vr13,    t2,     0
2230
2231    //8-15
2232    vld            vr7,     a2,      7
2233    z1_upsample_edge_data_init2
2234    vst            vr13,    t2,     16
2235
2236    //16-23
2237    vldrepl.b      vr7,     a2,     15
2238    z1_upsample_edge_calc_other
2239    vst            vr13,    t2,     32
2240
2241    vst            vr13,    t2,     48 //24-31
2242    vst            vr13,    t2,     64 //32-39
2243    vst            vr13,    t2,     80 //40-47
2244    vst            vr13,    t2,     96 //48-55
2245    vst            vr13,    t2,     112 //56-63
2246
2247    //64-70
2248    fst.d          f13,     t2,     128
2249    vstelm.w       vr13,    t2,     136,   2
2250    vstelm.h       vr13,    t2,     140,   6
2251
2252    ld.bu          t7,      a2,     15
2253    st.b           t7,      t2,     142
2254    b              .Z1_UEDGE_END
2255
2256.Z1_UEDGE_W16:
2257    andi           t6,      a3,     16
2258    beqz           t6,      .Z1_UEDGE_W32
2259.Z1_UEDGE_W16_H4:
2260    andi           t6,      a4,     4
2261    beqz           t6,      .Z1_UEDGE_W16_H8
2262
2263    //0-7
2264    vld            vr7,     a2,      -1
2265    z1_upsample_edge_data_init1
2266    vst            vr13,    t2,     0
2267
2268    //8-15
2269    vld            vr7,     a2,      7
2270    z1_upsample_edge_data_init1
2271    vst            vr13,    t2,     16
2272
2273    //16-18
2274    vld            vr7,     a2,      15
2275    z1_upsample_edge_data_init1
2276    vstelm.w       vr13,    t2,     32,    0
2277    vstelm.h       vr13,    t2,     36,    2
2278
2279    ld.bu          t7,      a2,     19
2280    st.b           t7,      t2,     38
2281    b              .Z1_UEDGE_END
2282
2283.Z1_UEDGE_W16_H8:
2284    andi           t6,      a4,     8
2285    beqz           t6,      .Z1_UEDGE_W16_H16
2286
2287    //0-7
2288    vld            vr7,     a2,      -1
2289    z1_upsample_edge_data_init1
2290    vst            vr13,    t2,     0
2291
2292    //8-15
2293    vld            vr7,     a2,      7
2294    z1_upsample_edge_data_init1
2295    vst            vr13,    t2,      16
2296
2297    //16-22
2298    vld            vr7,     a2,      15
2299    vbsrl.v        vr11,    vr7,     1
2300    vbsrl.v        vr12,    vr7,     2
2301    vextrins.b     vr12,    vr12,    0x76
2302    vbsrl.v        vr13,    vr7,     3
2303    z1_upsample_edge_calc_loop
2304    fst.d          f13,     t2,     32
2305    vstelm.w       vr13,    t2,     40,    2
2306    vstelm.h       vr13,    t2,     44,    6
2307
2308    ld.bu          t7,      a2,     23
2309    st.b           t7,      t2,     46
2310    b              .Z1_UEDGE_END
2311
2312.Z1_UEDGE_W16_H16:
2313    andi           t6,      a4,     16
2314    beqz           t6,      .Z1_UEDGE_W16_H32
2315
2316    //0-7
2317    vld            vr7,     a2,      -1
2318    z1_upsample_edge_data_init1
2319    vst            vr13,    t2,     0
2320
2321    //8-15
2322    vld            vr7,     a2,      7
2323    z1_upsample_edge_data_init1
2324    vst            vr13,    t2,     16
2325
2326    //16-23
2327    vld            vr7,     a2,      15
2328    z1_upsample_edge_data_init1
2329    vst            vr13,    t2,      32
2330
2331    //24-30
2332    vld            vr7,     a2,      23
2333    vbsrl.v        vr11,    vr7,     1
2334    vbsrl.v        vr12,    vr7,     2
2335    vextrins.b     vr12,    vr12,    0x76
2336    vbsrl.v        vr13,    vr7,     3
2337    z1_upsample_edge_calc_loop
2338    fst.d          f13,     t2,     48
2339    vstelm.w       vr13,    t2,     56,    2
2340    vstelm.h       vr13,    t2,     60,    6
2341
2342    ld.bu          t7,      a2,     31
2343    st.b           t7,      t2,     62
2344    b              .Z1_UEDGE_END
2345
2346.Z1_UEDGE_W16_H32:
2347    andi           t6,      a4,     32
2348    beqz           t6,      .Z1_UEDGE_W16_H64
2349
2350    //0-7
2351    vld            vr7,     a2,      -1
2352    z1_upsample_edge_data_init1
2353    vst            vr13,    t2,     0
2354
2355    //8-15
2356    vld            vr7,     a2,      7
2357    z1_upsample_edge_data_init1
2358    vst            vr13,    t2,     16
2359
2360    //16-23
2361    vld            vr7,     a2,      15
2362    z1_upsample_edge_data_init1
2363    vst            vr13,    t2,      32
2364
2365    //24-31
2366    vld            vr7,     a2,      23
2367    z1_upsample_edge_data_init2
2368    vst            vr13,    t2,      48
2369
2370    //32-39
2371    vldrepl.b      vr7,     a2,      31
2372    z1_upsample_edge_calc_other
2373    vst            vr13,    t2,      64
2374
2375    //40-46
2376    fst.d          f13,     t2,     80
2377    vstelm.w       vr13,    t2,     88,    2
2378    vstelm.h       vr13,    t2,     92,    6
2379
2380    ld.bu          t7,      a2,     31
2381    st.b           t7,      t2,     94
2382    b              .Z1_UEDGE_END
2383
2384.Z1_UEDGE_W16_H64:
2385    //0-7
2386    vld            vr7,     a2,      -1
2387    z1_upsample_edge_data_init1
2388    vst            vr13,    t2,     0
2389
2390    //8-15
2391    vld            vr7,     a2,      7
2392    z1_upsample_edge_data_init1
2393    vst            vr13,    t2,     16
2394
2395    //16-23
2396    vld            vr7,     a2,      15
2397    z1_upsample_edge_data_init1
2398    vst            vr13,    t2,      32
2399
2400    //24-31
2401    vld            vr7,     a2,      23
2402    z1_upsample_edge_data_init2
2403    vst            vr13,    t2,      48
2404
2405    //32-39
2406    vldrepl.b      vr7,     a2,      31
2407    z1_upsample_edge_calc_other
2408    vst            vr13,    t2,      64
2409
2410    vst            vr13,    t2,      80  //40-47
2411    vst            vr13,    t2,      96  //48-55
2412    vst            vr13,    t2,      112 //56-63
2413    vst            vr13,    t2,      128 //64-71
2414
2415    //72-78
2416    fst.d          f13,     t2,     144
2417    vstelm.w       vr13,    t2,     152,    2
2418    vstelm.h       vr13,    t2,     156,    6
2419
2420    ld.bu          t7,      a2,     31
2421    st.b           t7,      t2,     158
2422    b              .Z1_UEDGE_END
2423
2424.Z1_UEDGE_W32:
2425    andi           t6,      a3,     32
2426    beqz           t6,      .Z1_UEDGE_W64
2427.Z1_UEDGE_W32_H8:
2428    andi           t6,      a4,     8
2429    beqz           t6,      .Z1_UEDGE_W32_H16
2430
2431    //0-7
2432    vld            vr7,     a2,      -1
2433    z1_upsample_edge_data_init1
2434    vst            vr13,    t2,     0
2435
2436    //8-15
2437    vld            vr7,     a2,      7
2438    z1_upsample_edge_data_init1
2439    vst            vr13,    t2,      16
2440
2441    //16-23
2442    vld            vr7,     a2,      15
2443    z1_upsample_edge_data_init1
2444    vst            vr13,    t2,      32
2445
2446    //24-31
2447    vld            vr7,     a2,      23
2448    z1_upsample_edge_data_init1
2449    vst            vr13,    t2,      48
2450
2451    //32-38
2452    vld            vr7,     a2,      31
2453    vbsrl.v        vr11,    vr7,     1
2454    vbsrl.v        vr12,    vr7,     2
2455    vextrins.b     vr12,    vr12,    0x76
2456    vbsrl.v        vr13,    vr7,     3
2457    z1_upsample_edge_calc_loop
2458    fst.d          f13,     t2,      64
2459    vstelm.w       vr13,    t2,      72,    2
2460    vstelm.h       vr13,    t2,      76,    6
2461
2462    ld.bu          t7,      a2,     39
2463    st.b           t7,      t2,     78
2464    b              .Z1_UEDGE_END
2465
2466.Z1_UEDGE_W32_H16:
2467    andi           t6,      a4,     16
2468    beqz           t6,      .Z1_UEDGE_W32_H32
2469
2470    //0-7
2471    vld            vr7,     a2,      -1
2472    z1_upsample_edge_data_init1
2473    vst            vr13,    t2,     0
2474
2475    //8-15
2476    vld            vr7,     a2,      7
2477    z1_upsample_edge_data_init1
2478    vst            vr13,    t2,      16
2479
2480    //16-23
2481    vld            vr7,     a2,      15
2482    z1_upsample_edge_data_init1
2483    vst            vr13,    t2,      32
2484
2485    //24-31
2486    vld            vr7,     a2,      23
2487    z1_upsample_edge_data_init1
2488    vst            vr13,    t2,      48
2489
2490    //32-39
2491    vld            vr7,     a2,      31
2492    z1_upsample_edge_data_init1
2493    vst            vr13,    t2,      64
2494
2495    //40-46
2496    vld            vr7,     a2,      39
2497    vbsrl.v        vr11,    vr7,     1
2498    vbsrl.v        vr12,    vr7,     2
2499    vextrins.b     vr12,    vr12,    0x76
2500    vbsrl.v        vr13,    vr7,     3
2501    z1_upsample_edge_calc_loop
2502    fst.d          f13,     t2,      80
2503    vstelm.w       vr13,    t2,      88,    2
2504    vstelm.h       vr13,    t2,      92,    6
2505
2506    ld.bu          t7,      a2,     47
2507    st.b           t7,      t2,     94
2508    b              .Z1_UEDGE_END
2509
2510.Z1_UEDGE_W32_H32:
2511    andi           t6,      a4,     32
2512    beqz           t6,      .Z1_UEDGE_W32_H64
2513
2514    //0-7
2515    vld            vr7,     a2,      -1
2516    z1_upsample_edge_data_init1
2517    vst            vr13,    t2,     0
2518
2519    //8-15
2520    vld            vr7,     a2,      7
2521    z1_upsample_edge_data_init1
2522    vst            vr13,    t2,      16
2523
2524    //16-23
2525    vld            vr7,     a2,      15
2526    z1_upsample_edge_data_init1
2527    vst            vr13,    t2,      32
2528
2529    //24-31
2530    vld            vr7,     a2,      23
2531    z1_upsample_edge_data_init1
2532    vst            vr13,    t2,      48
2533
2534    //32-39
2535    vld            vr7,     a2,      31
2536    z1_upsample_edge_data_init1
2537    vst            vr13,    t2,      64
2538
2539    //40-47
2540    vld            vr7,     a2,      39
2541    z1_upsample_edge_data_init1
2542    vst            vr13,    t2,      80
2543
2544    //48-55
2545    vld            vr7,     a2,      47
2546    z1_upsample_edge_data_init1
2547    vst            vr13,    t2,      96
2548
2549    //56-62
2550    vld            vr7,     a2,      55
2551    vbsrl.v        vr11,    vr7,     1
2552    vbsrl.v        vr12,    vr7,     2
2553    vextrins.b     vr12,    vr12,    0x76
2554    vbsrl.v        vr13,    vr7,     3
2555    z1_upsample_edge_calc_loop
2556    fst.d          f13,     t2,      112
2557    vstelm.w       vr13,    t2,      120,   2
2558    vstelm.h       vr13,    t2,      124,   6
2559
2560    ld.bu          t7,      a2,     63
2561    st.b           t7,      t2,     126
2562    b              .Z1_UEDGE_END
2563
2564.Z1_UEDGE_W32_H64:
2565    //0-7
2566    vld            vr7,     a2,      -1
2567    z1_upsample_edge_data_init1
2568    vst            vr13,    t2,     0
2569
2570    //8-15
2571    vld            vr7,     a2,      7
2572    z1_upsample_edge_data_init1
2573    vst            vr13,    t2,      16
2574
2575    //16-23
2576    vld            vr7,     a2,      15
2577    z1_upsample_edge_data_init1
2578    vst            vr13,    t2,      32
2579
2580    //24-31
2581    vld            vr7,     a2,      23
2582    z1_upsample_edge_data_init1
2583    vst            vr13,    t2,      48
2584
2585    //32-39
2586    vld            vr7,     a2,      31
2587    z1_upsample_edge_data_init1
2588    vst            vr13,    t2,      64
2589
2590    //40-47
2591    vld            vr7,     a2,      39
2592    z1_upsample_edge_data_init1
2593    vst            vr13,    t2,      80
2594
2595    //48-55
2596    vld            vr7,     a2,      47
2597    z1_upsample_edge_data_init1
2598    vst            vr13,    t2,      96
2599
2600    //56-63
2601    vld            vr7,     a2,      55
2602    z1_upsample_edge_data_init2
2603    vst            vr13,    t2,      112
2604
2605    //64-71
2606    vldrepl.b      vr7,     a2,      63
2607    z1_upsample_edge_calc_other
2608    vst            vr13,    t2,      128
2609
2610    vst            vr13,    t2,      144 //72-79
2611    vst            vr13,    t2,      160 //80-87
2612
2613    //88-94
2614    fst.d          f13,     t2,     176
2615    vstelm.w       vr13,    t2,     184,    2
2616    vstelm.h       vr13,    t2,     188,    6
2617
2618    ld.bu          t7,      a2,     63
2619    st.b           t7,      t2,     190
2620    b              .Z1_UEDGE_END
2621
2622.Z1_UEDGE_W64:
2623.Z1_UEDGE_W64_H16:
2624    andi           t6,      a4,     16
2625    beqz           t6,      .Z1_UEDGE_W64_H32
2626
2627    //0-7
2628    vld            vr7,     a2,      -1
2629    z1_upsample_edge_data_init1
2630    vst            vr13,    t2,     0
2631
2632    //8-15
2633    vld            vr7,     a2,      7
2634    z1_upsample_edge_data_init1
2635    vst            vr13,    t2,      16
2636
2637    //16-23
2638    vld            vr7,     a2,      15
2639    z1_upsample_edge_data_init1
2640    vst            vr13,    t2,      32
2641
2642    //24-31
2643    vld            vr7,     a2,      23
2644    z1_upsample_edge_data_init1
2645    vst            vr13,    t2,      48
2646
2647    //32-39
2648    vld            vr7,     a2,      31
2649    z1_upsample_edge_data_init1
2650    vst            vr13,    t2,      64
2651
2652    //40-47
2653    vld            vr7,     a2,      39
2654    z1_upsample_edge_data_init1
2655    vst            vr13,    t2,      80
2656
2657    //48-55
2658    vld            vr7,     a2,      47
2659    z1_upsample_edge_data_init1
2660    vst            vr13,    t2,      96
2661
2662    //56-63
2663    vld            vr7,     a2,      55
2664    z1_upsample_edge_data_init1
2665    vst            vr13,    t2,      112
2666
2667    //64-71
2668    vld            vr7,     a2,      63
2669    z1_upsample_edge_data_init1
2670    vst            vr13,    t2,      128
2671
2672    //72-78
2673    vld            vr7,     a2,      71
2674    z1_upsample_edge_data_init2
2675    fst.d          f13,     t2,     144
2676    vstelm.w       vr13,    t2,     152,    2
2677    vstelm.h       vr13,    t2,     156,    6
2678
2679    ld.bu          t7,      a2,     79
2680    st.b           t7,      t2,     158
2681    b              .Z1_UEDGE_END
2682
2683.Z1_UEDGE_W64_H32:
2684    andi           t6,      a4,     32
2685    beqz           t6,      .Z1_UEDGE_W64_H64
2686
2687    //0-7
2688    vld            vr7,     a2,      -1
2689    z1_upsample_edge_data_init1
2690    vst            vr13,    t2,     0
2691
2692    //8-15
2693    vld            vr7,     a2,      7
2694    z1_upsample_edge_data_init1
2695    vst            vr13,    t2,      16
2696
2697    //16-23
2698    vld            vr7,     a2,      15
2699    z1_upsample_edge_data_init1
2700    vst            vr13,    t2,      32
2701
2702    //24-31
2703    vld            vr7,     a2,      23
2704    z1_upsample_edge_data_init1
2705    vst            vr13,    t2,      48
2706
2707    //32-39
2708    vld            vr7,     a2,      31
2709    z1_upsample_edge_data_init1
2710    vst            vr13,    t2,      64
2711
2712    //40-47
2713    vld            vr7,     a2,      39
2714    z1_upsample_edge_data_init1
2715    vst            vr13,    t2,      80
2716
2717    //48-55
2718    vld            vr7,     a2,      47
2719    z1_upsample_edge_data_init1
2720    vst            vr13,    t2,      96
2721
2722    //56-63
2723    vld            vr7,     a2,      55
2724    z1_upsample_edge_data_init1
2725    vst            vr13,    t2,      112
2726
2727    //64-71
2728    vld            vr7,     a2,      63
2729    z1_upsample_edge_data_init1
2730    vst            vr13,    t2,      128
2731
2732    //72-79
2733    vld            vr7,     a2,      71
2734    z1_upsample_edge_data_init1
2735    vst            vr13,    t2,      144
2736
2737    //80-87
2738    vld            vr7,     a2,      79
2739    z1_upsample_edge_data_init1
2740    vst            vr13,    t2,      160
2741
2742    //88-94
2743    vld            vr7,     a2,      87
2744    z1_upsample_edge_data_init2
2745    fst.d          f13,     t2,     176
2746    vstelm.w       vr13,    t2,     184,    2
2747    vstelm.h       vr13,    t2,     188,    6
2748
2749    ld.bu          t7,      a2,     95
2750    st.b           t7,      t2,     190
2751    b              .Z1_UEDGE_END
2752
2753.Z1_UEDGE_W64_H64:
2754    //0-7
2755    vld            vr7,     a2,      -1
2756    z1_upsample_edge_data_init1
2757    vst            vr13,    t2,     0
2758
2759    //8-15
2760    vld            vr7,     a2,      7
2761    z1_upsample_edge_data_init1
2762    vst            vr13,    t2,      16
2763
2764    //16-23
2765    vld            vr7,     a2,      15
2766    z1_upsample_edge_data_init1
2767    vst            vr13,    t2,      32
2768
2769    //24-31
    // Tail of upsample_edge for the largest edge (reads src bytes up to
    // a2[127], i.e. w+h = 128 input pixels -> 255 output bytes at t2).
    // Each vld at a2 + 8*i - 1 feeds the upsample macro, which emits 16
    // interleaved/interpolated bytes stored at t2 + 16*i.
2770    vld            vr7,     a2,      23
2771    z1_upsample_edge_data_init1
2772    vst            vr13,    t2,      48
2773
2774    //32-39
2775    vld            vr7,     a2,      31
2776    z1_upsample_edge_data_init1
2777    vst            vr13,    t2,      64
2778
2779    //40-47
2780    vld            vr7,     a2,      39
2781    z1_upsample_edge_data_init1
2782    vst            vr13,    t2,      80
2783
2784    //48-55
2785    vld            vr7,     a2,      47
2786    z1_upsample_edge_data_init1
2787    vst            vr13,    t2,      96
2788
2789    //56-63
2790    vld            vr7,     a2,      55
2791    z1_upsample_edge_data_init1
2792    vst            vr13,    t2,      112
2793
2794    //64-71
2795    vld            vr7,     a2,      63
2796    z1_upsample_edge_data_init1
2797    vst            vr13,    t2,      128
2798
2799    //72-79
2800    vld            vr7,     a2,      71
2801    z1_upsample_edge_data_init1
2802    vst            vr13,    t2,      144
2803
2804    //80-87
2805    vld            vr7,     a2,      79
2806    z1_upsample_edge_data_init1
2807    vst            vr13,    t2,      160
2808
2809    //88-95
2810    vld            vr7,     a2,      87
2811    z1_upsample_edge_data_init1
2812    vst            vr13,    t2,      176
2813
2814    //96-103
2815    vld            vr7,     a2,      95
2816    z1_upsample_edge_data_init1
2817    vst            vr13,    t2,      192
2818
2819    //104-111
2820    vld            vr7,     a2,      103
2821    z1_upsample_edge_data_init1
2822    vst            vr13,    t2,      208
2823
2824    //112-119
2825    vld            vr7,     a2,      111
2826    z1_upsample_edge_data_init1
2827    vst            vr13,    t2,      224
2828
2829    //120-126
2830    vld            vr7,     a2,      119
2831    z1_upsample_edge_data_init2
    // Final partial group: 8 + 4 + 2 = 14 bytes covering t2[240..253].
2832    fst.d          f13,     t2,      240
2833    vstelm.w       vr13,    t2,      248,    2
2834    vstelm.h       vr13,    t2,      252,    6
2835
    // Last source pixel is copied through unfiltered to the final slot
    // (output byte 254 = src[127]); total output is 2*128 - 1 bytes.
2836    ld.bu          t7,      a2,      127
2837    st.b           t7,      t2,      254
2838    b              .Z1_UEDGE_END
2839
2840.Z1_UEDGE_END:
2841    //upsample_edge end
2842
2843    or             a7,      t2,      t2   //top
2844    add.d          t0,      a3,      a4
2845    slli.d         t0,      t0,      1
2846    addi.d         t0,      t0,      -2   //max_base_x
2847    slli.d         t1,      t1,      1
2848    b              .IPRED_Z1_UA_END
2849
2850.IPRED_Z1_NOTUA:
    // No edge upsampling: decide how strongly to smooth the top edge.
    // This open-codes dav1d's C get_filter_strength(w+h, 90-angle, is_sm):
    //   a7 = edge-filter enable flag on entry, then w+h
    //   t4 = 90 - angle (distance from the vertical direction)
    //   a6 = table selector on entry (nonzero presumably = is_sm —
    //        confirm against the C reference), filter strength 0..3 on exit
2851    or             t5,      zero,    zero  //upsample_above=0
2852    beqz           a7,      .IPRED_Z1_NOTFS
2853    add.d          a7,      a3,      a4  //w+h
2854    li.w           t4,      90
2855    sub.d          t4,      t4,      a5
2856    // ipred_get_filter_strength a6:filter_strength
2857    beqz           a6,      .Z1_GETFS20
    // --- first threshold table (a6 != 0) ---
2858.Z1_GETFS10:  //wh<=8
2859    addi.d         t6,      a7,      -8
2860    blt            zero,    t6,      .Z1_GETFS11
2861    addi.d         t6,      t4,      -64           // d >= 64 -> strength 2
2862    blt            t6,      zero,    .Z1_GETFS101
2863    ori            a6,      zero,    2
2864    b              .Z1_GETFS40
2865.Z1_GETFS101:
2866    addi.d         t6,      t4,      -40           // d >= 40 -> strength 1
2867    blt            t6,      zero,    .Z1_GETFS30
2868    ori            a6,      zero,    1
2869    b              .Z1_GETFS40
2870.Z1_GETFS11:  //wh<=16
2871    addi.d         t6,      a7,      -16
2872    blt            zero,    t6,      .Z1_GETFS12
2873    addi.d         t6,      t4,      -48           // d >= 48 -> strength 2
2874    blt            t6,      zero,    .Z1_GETFS111
2875    ori            a6,      zero,    2
2876    b              .Z1_GETFS40
2877.Z1_GETFS111:
2878    addi.d         t6,      t4,      -20           // d >= 20 -> strength 1
2879    blt            t6,      zero,    .Z1_GETFS30
2880    ori            a6,      zero,    1
2881    b              .Z1_GETFS40
2882.Z1_GETFS12:  //wh<=24
2883    addi.d         t6,      a7,      -24
2884    blt            zero,    t6,      .Z1_GETFS13
2885    addi.d         t6,      t4,      -4            // d >= 4 -> strength 3
2886    blt            t6,      zero,    .Z1_GETFS30
2887    ori            a6,      zero,    3
2888    b              .Z1_GETFS40
2889.Z1_GETFS13:                                       // wh > 24 -> strength 3
2890    ori            a6,      zero,    3
2891    b              .Z1_GETFS40
2892
    // --- second threshold table (a6 == 0 on entry) ---
2893.Z1_GETFS20:  //wh<=8
2894    addi.d         t6,      a7,      -8
2895    blt            zero,    t6,      .Z1_GETFS21
2896    addi.d         t6,      t4,      -56           // d >= 56 -> strength 1
2897    blt            t6,      zero,    .Z1_GETFS30
2898    ori            a6,      zero,    1
2899    b              .Z1_GETFS40
2900.Z1_GETFS21:  //wh<=16
2901    addi.d         t6,      a7,      -16
2902    blt            zero,    t6,      .Z1_GETFS22
2903    addi.d         t6,      t4,      -40           // d >= 40 -> strength 1
2904    blt            t6,      zero,    .Z1_GETFS30
2905    ori            a6,      zero,    1
2906    b              .Z1_GETFS40
2907.Z1_GETFS22:  //wh<=24
2908    addi.d         t6,      a7,      -24
2909    blt            zero,    t6,      .Z1_GETFS23
2910    addi.d         t6,      t4,      -32           // d >= 32 -> strength 3
2911    blt            t6,      zero,    .Z1_GETFS221
2912    ori            a6,      zero,    3
2913    b              .Z1_GETFS40
2914.Z1_GETFS221:
2915    addi.d         t6,      t4,      -16           // d >= 16 -> strength 2
2916    blt            t6,      zero,    .Z1_GETFS222
2917    ori            a6,      zero,    2
2918    b              .Z1_GETFS40
2919.Z1_GETFS222:
2920    addi.d         t6,      t4,      -8            // d >= 8 -> strength 1
2921    blt            t6,      zero,    .Z1_GETFS30
2922    ori            a6,      zero,    1
2923    b              .Z1_GETFS40
2924.Z1_GETFS23:  //wh<=32
2925    addi.d         t6,      a7,      -32
2926    blt            zero,    t6,      .Z1_GETFS24
2927    addi.d         t6,      t4,      -32           // d >= 32 -> strength 3
2928    blt            t6,      zero,    .Z1_GETFS231
2929    ori            a6,      zero,    3
2930    b              .Z1_GETFS40
2931.Z1_GETFS231:
2932    addi.d         t6,      t4,      -4            // d >= 4 -> strength 2
2933    blt            t6,      zero,    .Z1_GETFS232
2934    ori            a6,      zero,    2
2935    b              .Z1_GETFS40
2936.Z1_GETFS232:                                      // else strength 1
2937    ori            a6,      zero,    1
2938    b              .Z1_GETFS40
2939.Z1_GETFS24:                                       // wh > 32 -> strength 3
2940    ori            a6,      zero,    3
2941    b              .Z1_GETFS40
2942.Z1_GETFS30:                                       // no filtering
2943   or              a6,      zero,    zero
2944.Z1_GETFS40:
2945
    // strength 0 -> skip edge filtering entirely
2946    beqz           a6,      .IPRED_Z1_NOTFS
2947
2948.IPRED_Z1_IFFS:
    // filter_edge: load the 5-tap smoothing kernel selected by the
    // strength in a6 (1..3).  Each table entry is 16 bytes, hence the
    // (a6-1) << 4 index.  vr1 holds taps 0-3, vr6 holds tap 4.
2949    // filter_edge
2950    addi.d         a6,      a6,      -1
2951    slli.d         a6,      a6,      4
2952    la.local       t0,      ipred_filter_edge_kernel1
2953    vldx           vr1,     t0,      a6    //kernel[0-3]
2954
2955    la.local       t0,      ipred_filter_edge_kernel2
2956    vldx           vr6,     t0,      a6    //kernel[4]
2957
2958.IPRED_Z1_FS_W4:
    // filter_edge, width 4: smooth w+h edge pixels from a2 into the
    // buffer at t2, 8 outputs per step, dispatching on height (a4).
    // vreplvei/vextrins pairs below build padding vectors that replicate
    // the last real edge pixel past the end of the source row.
2959    andi           t0,      a3,      4
2960    beqz           t0,      .IPRED_Z1_FS_W8
2961.IPRED_Z1_FS_W4_H4:
2962    andi           t0,      a4,      4
2963    beqz           t0,      .IPRED_Z1_FS_W4_H8
2964
2965    //0-7
2966    vld            vr7,     a2,      -1
2967    z1_filter_edge_data_init4
2968    vbsrl.v        vr13,    vr7,     3
2969    vextrins.b     vr13,    vr13,    0x65
2970    vextrins.b     vr13,    vr13,    0x75
2971    z1_filter_edge_calc_loop2
2972    fst.d          f12,     t2,      0
2973    b              .IPRED_Z1_FS_END
2974
2975.IPRED_Z1_FS_W4_H8:
2976    andi           t0,      a4,      8
2977    beqz           t0,      .IPRED_Z1_FS_W4_H16
2978
2979    //0-7
2980    vld            vr7,     a2,      -1
2981    z1_filter_edge_data_init4
2982    vbsrl.v        vr13,    vr7,     3
2983    vextrins.b     vr13,    vr13,    0x65
2984    vextrins.b     vr13,    vr13,    0x75
2985    z1_filter_edge_calc_loop2
2986    fst.d          f12,     t2,      0
2987
2988    //8-11
2989    vreplvei.b     vr10,    vr7,     8
2990    vextrins.b     vr10,    vr7,     0x07
2991    z1_filter_edge_calc_other
2992    fst.s          f12,     t2,      8
2993
2994    b              .IPRED_Z1_FS_END
2995
2996.IPRED_Z1_FS_W4_H16:
2997    andi           t0,      a4,      16
2998    beqz           t0,      .IPRED_Z1_FS_W4_H32
2999
3000    //0-7
3001    vld            vr7,     a2,      -1
3002    z1_filter_edge_data_init4
3003    vbsrl.v        vr13,    vr7,     3
3004    vextrins.b     vr13,    vr13,    0x65
3005    vextrins.b     vr13,    vr13,    0x75
3006    z1_filter_edge_calc_loop2
3007    fst.d          f12,     t2,      0
3008
3009    //8-15
3010    vreplvei.b     vr10,    vr7,     8
3011    vextrins.b     vr10,    vr7,     0x07
3012    z1_filter_edge_calc_other
3013    fst.d          f12,     t2,      8
3014
3015    //16-19
    // Past the real edge the filtered value is constant: broadcast the
    // last result byte and store it as the tail padding.
3016    vreplvei.b     vr12,    vr12,    1
3017    fst.s          f12,     t2,      16
3018
3019    b              .IPRED_Z1_FS_END
3020
3021.IPRED_Z1_FS_W4_H32:
3022    andi           t0,      a4,      32
3023    beqz           t0,      .IPRED_Z1_FS_W4_H64
3024
3025    //0-7
3026    vld            vr7,     a2,      -1
3027    z1_filter_edge_data_init4
3028    vbsrl.v        vr13,    vr7,     3
3029    vextrins.b     vr13,    vr13,    0x65
3030    vextrins.b     vr13,    vr13,    0x75
3031    z1_filter_edge_calc_loop2
3032    fst.d          f12,     t2,      0
3033
3034    //8-15
3035    vreplvei.b     vr10,    vr7,     8
3036    vextrins.b     vr10,    vr7,     0x07
3037    z1_filter_edge_calc_other
3038    fst.d          f12,     t2,      8
3039
3040    //16-23
3041    vreplvei.b     vr12,    vr12,    1
3042    fst.d          f12,     t2,      16
3043
3044    fst.d          f12,     t2,      24 //24-31
3045    fst.s          f12,     t2,      32 //32-35
3046
3047    b              .IPRED_Z1_FS_END
3048
3049.IPRED_Z1_FS_W4_H64:
3050    //0-7
3051    vld            vr7,     a2,      -1
3052    z1_filter_edge_data_init4
3053    vbsrl.v        vr13,    vr7,     3
3054    vextrins.b     vr13,    vr13,    0x65
3055    vextrins.b     vr13,    vr13,    0x75
3056    z1_filter_edge_calc_loop2
3057    fst.d          f12,     t2,      0
3058
3059    //8-15
3060    vreplvei.b     vr10,    vr7,     8
3061    vextrins.b     vr10,    vr7,     0x07
3062    z1_filter_edge_calc_other
3063    fst.d          f12,     t2,      8
3064
3065    //16-23
3066    vreplvei.b     vr12,    vr12,    1
3067    fst.d          f12,     t2,      16
3068
3069    fst.d          f12,     t2,      24 //24-31
3070    fst.d          f12,     t2,      32 //32-39
3071    fst.d          f12,     t2,      40 //40-47
3072    fst.d          f12,     t2,      48 //48-55
3073    fst.d          f12,     t2,      56 //56-63
3074    fst.s          f12,     t2,      64 //64-67
3075
3076    b              .IPRED_Z1_FS_END
3077
3078.IPRED_Z1_FS_W8:
    // filter_edge, width 8: smooth w+h = 8+h edge pixels from a2 into
    // t2, dispatching on height (a4).  Loads start at a2-1 / a2+6 so
    // each 8-output group sees its two left neighbours.
3079    andi           t0,      a3,      8
3080    beqz           t0,      .IPRED_Z1_FS_W16
3081.IPRED_Z1_FS_W8_H4:
3082    andi           t0,      a4,      4
3083    beqz           t0,      .IPRED_Z1_FS_W8_H8
3084
3085    //0-7
3086    vld            vr7,     a2,      -1
3087    z1_filter_edge_data_init1
3088    vbsrl.v        vr13,    vr7,     3
3089    z1_filter_edge_calc_loop2
3090    fst.d          f12,     t2,      0
3091
3092    //8-11
    // 4-pixel tail: build the shifted tap inputs by hand (no init macro)
    // since only the first 4 results are stored.
3093    vld            vr7,     a2,      6
3094    vbsrl.v        vr11,    vr7,     1
3095    vbsrl.v        vr12,    vr7,     2
3096    vbsrl.v        vr13,    vr7,     3
3097    vextrins.b     vr13,    vr13,    0x32
3098    vsllwil.hu.bu  vr10,    vr7,     0
3099    vsllwil.hu.bu  vr11,    vr11,    0
3100    vsllwil.hu.bu  vr12,    vr12,    0
3101    vsllwil.hu.bu  vr13,    vr13,    0
3102    z1_filter_edge_calc_loop1
3103
3104    vbsrl.v        vr13,    vr7,     4
3105    vextrins.b     vr13,    vr13,    0x21
3106    vextrins.b     vr13,    vr13,    0x31
3107    z1_filter_edge_calc_loop2
3108    fst.s          f12,     t2,      8
3109    b              .IPRED_Z1_FS_END
3110
3111.IPRED_Z1_FS_W8_H8:
3112    andi           t0,      a4,      8
3113    beqz           t0,      .IPRED_Z1_FS_W8_H16
3114
3115    //0-7
3116    vld            vr7,     a2,      -1
3117    z1_filter_edge_data_init1
3118    vbsrl.v        vr13,    vr7,     3
3119    z1_filter_edge_calc_loop2
3120    fst.d          f12,     t2,      0
3121
3122    //8-15
3123    vld            vr7,     a2,      6
3124    z1_filter_edge_data_init3
3125    vbsrl.v        vr13,    vr7,     4
3126    vextrins.b     vr13,    vr13,    0x65
3127    vextrins.b     vr13,    vr13,    0x75
3128    z1_filter_edge_calc_loop2
3129    fst.d          f12,     t2,      8
3130    b              .IPRED_Z1_FS_END
3131
3132.IPRED_Z1_FS_W8_H16:
3133    andi           t0,      a4,      16
3134    beqz           t0,      .IPRED_Z1_FS_W8_H32
3135
3136    //0-7
3137    vld            vr7,     a2,      -1
3138    z1_filter_edge_data_init1
3139    vbsrl.v        vr13,    vr7,     3
3140    z1_filter_edge_calc_loop2
3141    fst.d          f12,     t2,      0
3142
3143    //8-15
3144    vld            vr7,     a2,      6
3145    z1_filter_edge_data_init3
3146    vbsrl.v        vr13,    vr7,     4
3147    vextrins.b     vr13,    vr13,    0x65
3148    vextrins.b     vr13,    vr13,    0x75
3149    z1_filter_edge_calc_loop2
3150    fst.d          f12,     t2,      8
3151
3152    //16-23
    // Replicate the last real edge pixel to filter the padded region.
3153    vreplvei.b     vr10,    vr7,     9
3154    vextrins.b     vr10,    vr7,     0x08
3155    z1_filter_edge_calc_other
3156    fst.d          f12,     t2,      16
3157
3158    b              .IPRED_Z1_FS_END
3159
3160.IPRED_Z1_FS_W8_H32:
3161    andi           t0,      a4,      32
3162    beqz           t0,      .IPRED_Z1_FS_W8_H64
3163
3164    //0-7
3165    vld            vr7,     a2,      -1
3166    z1_filter_edge_data_init1
3167    vbsrl.v        vr13,    vr7,     3
3168    z1_filter_edge_calc_loop2
3169    fst.d          f12,     t2,      0
3170
3171    //8-15
3172    vld            vr7,     a2,      6
3173    z1_filter_edge_data_init3
3174    vbsrl.v        vr13,    vr7,     4
3175    vextrins.b     vr13,    vr13,    0x65
3176    vextrins.b     vr13,    vr13,    0x75
3177    z1_filter_edge_calc_loop2
3178    fst.d          f12,     t2,      8
3179
3180    //16-23
3181    vreplvei.b     vr10,    vr7,     9
3182    vextrins.b     vr10,    vr7,     0x08
3183    z1_filter_edge_calc_other
3184    fst.d          f12,     t2,      16
3185
3186    //24-31
    // Beyond the edge the filtered value is constant: broadcast it.
3187    vreplvei.b     vr12,    vr12,    1
3188    fst.d          f12,     t2,      24
3189
3190    //32-39
3191    fst.d          f12,     t2,      32
3192
3193    b              .IPRED_Z1_FS_END
3194
3195.IPRED_Z1_FS_W8_H64:
3196    //0-7
3197    vld            vr7,     a2,      -1
3198    z1_filter_edge_data_init1
3199    vbsrl.v        vr13,    vr7,     3
3200    z1_filter_edge_calc_loop2
3201    fst.d          f12,     t2,      0
3202
3203    //8-15
3204    vld            vr7,     a2,      6
3205    z1_filter_edge_data_init3
3206    vbsrl.v        vr13,    vr7,     4
3207    vextrins.b     vr13,    vr13,    0x65
3208    vextrins.b     vr13,    vr13,    0x75
3209    z1_filter_edge_calc_loop2
3210    fst.d          f12,     t2,      8
3211
3212    //16-23
3213    vreplvei.b     vr10,    vr7,     9
3214    vextrins.b     vr10,    vr7,     0x08
3215    z1_filter_edge_calc_other
3216    fst.d          f12,     t2,      16
3217
3218    //24-31
3219    vreplvei.b     vr12,    vr12,    1
3220    fst.d          f12,     t2,      24
3221
3222    fst.d          f12,     t2,      32  //32-39
3223    fst.d          f12,     t2,      40  //40-47
3224    fst.d          f12,     t2,      48  //48-55
3225    fst.d          f12,     t2,      56  //56-63
3226    fst.d          f12,     t2,      64  //64-71
3227
3228    b              .IPRED_Z1_FS_END
3229
3230.IPRED_Z1_FS_W16:
    // filter_edge, width 16: smooth w+h = 16+h edge pixels from a2 into
    // t2, 8 outputs per group, dispatching on height (a4).
3231    andi           t0,      a3,      16
3232    beqz           t0,      .IPRED_Z1_FS_W32
3233.IPRED_Z1_FS_W16_H4:
3234    andi           t0,      a4,      4
3235    beqz           t0,      .IPRED_Z1_FS_W16_H8
3236
3237    //0-7
3238    vld            vr7,     a2,      -1
3239    z1_filter_edge_data_init1
3240    vbsrl.v        vr13,    vr7,     3
3241    z1_filter_edge_calc_loop2
3242    fst.d          f12,     t2,      0
3243
3244    //8-15
3245    vld            vr7,     a2,      6
3246    z1_filter_edge_data_init2
3247    vbsrl.v        vr13,    vr7,     4
3248    z1_filter_edge_calc_loop2
3249    fst.d          f12,     t2,      8
3250
3251    //16-19
    // 4-pixel tail: shifted tap inputs built by hand, only the first
    // 4 filtered bytes are stored.
3252    vld            vr7,     a2,      14
3253    vbsrl.v        vr11,    vr7,     1
3254    vbsrl.v        vr12,    vr7,     2
3255    vbsrl.v        vr13,    vr7,     3
3256    vextrins.b     vr13,    vr13,    0x32
3257    vsllwil.hu.bu  vr10,    vr7,     0
3258    vsllwil.hu.bu  vr11,    vr11,    0
3259    vsllwil.hu.bu  vr12,    vr12,    0
3260    vsllwil.hu.bu  vr13,    vr13,    0
3261    z1_filter_edge_calc_loop1
3262
3263    vbsrl.v        vr13,    vr7,     4
3264    vextrins.b     vr13,    vr13,    0x21
3265    vextrins.b     vr13,    vr13,    0x31
3266    z1_filter_edge_calc_loop2
3267    fst.s          f12,     t2,      16
3268    b              .IPRED_Z1_FS_END
3269
3270.IPRED_Z1_FS_W16_H8:
3271    andi           t0,      a4,      8
3272    beqz           t0,      .IPRED_Z1_FS_W16_H16
3273
3274    //0-7
3275    vld            vr7,     a2,      -1
3276    z1_filter_edge_data_init1
3277    vbsrl.v        vr13,    vr7,     3
3278    z1_filter_edge_calc_loop2
3279    fst.d          f12,     t2,      0
3280
3281    //8-15
3282    vld            vr7,     a2,      6
3283    z1_filter_edge_data_init2
3284    vbsrl.v        vr13,    vr7,     4
3285    z1_filter_edge_calc_loop2
3286    fst.d          f12,     t2,      8
3287
3288    //16-23
3289    vld            vr7,     a2,      14
3290    z1_filter_edge_data_init3
3291    vbsrl.v        vr13,    vr7,     4
3292    vextrins.b     vr13,    vr13,    0x65
3293    vextrins.b     vr13,    vr13,    0x75
3294    z1_filter_edge_calc_loop2
3295    fst.d          f12,     t2,      16
3296    b              .IPRED_Z1_FS_END
3297
3298.IPRED_Z1_FS_W16_H16:
3299    andi           t0,      a4,      16
3300    beqz           t0,      .IPRED_Z1_FS_W16_H32
3301
3302    //0-7
3303    vld            vr7,     a2,      -1
3304    z1_filter_edge_data_init1
3305    vbsrl.v        vr13,    vr7,     3
3306    z1_filter_edge_calc_loop2
3307    fst.d          f12,     t2,      0
3308
3309    //8-15
3310    vld            vr7,     a2,      6
3311    z1_filter_edge_data_init2
3312    vbsrl.v        vr13,    vr7,     4
3313    z1_filter_edge_calc_loop2
3314    fst.d          f12,     t2,      8
3315
3316    //16-23
3317    vld            vr7,     a2,      14
3318    z1_filter_edge_data_init2
3319    vbsrl.v        vr13,    vr7,     4
3320    z1_filter_edge_calc_loop2
3321    fst.d          f12,     t2,      16
3322
3323    //24-31
3324    vld            vr7,     a2,      22
3325    z1_filter_edge_data_init3
3326    vbsrl.v        vr13,    vr7,     4
3327    vextrins.b     vr13,    vr13,    0x65
3328    vextrins.b     vr13,    vr13,    0x75
3329    z1_filter_edge_calc_loop2
3330    fst.d          f12,     t2,      24
3331    b              .IPRED_Z1_FS_END
3332
3333.IPRED_Z1_FS_W16_H32:
3334    andi           t0,      a4,      32
3335    beqz           t0,      .IPRED_Z1_FS_W16_H64
3336
3337    //0-7
3338    vld            vr7,     a2,      -1
3339    z1_filter_edge_data_init1
3340    vbsrl.v        vr13,    vr7,     3
3341    z1_filter_edge_calc_loop2
3342    fst.d          f12,     t2,      0
3343
3344    //8-15
3345    vld            vr7,     a2,      6
3346    z1_filter_edge_data_init2
3347    vbsrl.v        vr13,    vr7,     4
3348    z1_filter_edge_calc_loop2
3349    fst.d          f12,     t2,      8
3350
3351    //16-23
3352    vld            vr7,     a2,      14
3353    z1_filter_edge_data_init2
3354    vbsrl.v        vr13,    vr7,     4
3355    z1_filter_edge_calc_loop2
3356    fst.d          f12,     t2,      16
3357
3358    //24-31
3359    vld            vr7,     a2,      22
3360    z1_filter_edge_data_init3
3361    vbsrl.v        vr13,    vr7,     4
3362    vextrins.b     vr13,    vr13,    0x65
3363    vextrins.b     vr13,    vr13,    0x75
3364    z1_filter_edge_calc_loop2
3365    fst.d          f12,     t2,      24
3366
3367    //32-39
    // Replicate the last real edge pixel to filter the padded region.
3368    vreplvei.b     vr10,    vr7,     9
3369    vextrins.b     vr10,    vr7,     0x08
3370    z1_filter_edge_calc_other
3371    fst.d          f12,     t2,      32
3372
3373    //40-47
3374    vreplvei.b     vr12,    vr12,    1
3375    fst.d          f12,     t2,      40
3376
3377    b              .IPRED_Z1_FS_END
3378
3379.IPRED_Z1_FS_W16_H64:
3380    //0-7
3381    vld            vr7,     a2,      -1
3382    z1_filter_edge_data_init1
3383    vbsrl.v        vr13,    vr7,     3
3384    z1_filter_edge_calc_loop2
3385    fst.d          f12,     t2,      0
3386
3387    //8-15
3388    vld            vr7,     a2,      6
3389    z1_filter_edge_data_init2
3390    vbsrl.v        vr13,    vr7,     4
3391    z1_filter_edge_calc_loop2
3392    fst.d          f12,     t2,      8
3393
3394    //16-23
3395    vld            vr7,     a2,      14
3396    z1_filter_edge_data_init2
3397    vbsrl.v        vr13,    vr7,     4
3398    z1_filter_edge_calc_loop2
3399    fst.d          f12,     t2,      16
3400
3401    //24-31
3402    vld            vr7,     a2,      22
3403    z1_filter_edge_data_init3
3404    vbsrl.v        vr13,    vr7,     4
3405    vextrins.b     vr13,    vr13,    0x65
3406    vextrins.b     vr13,    vr13,    0x75
3407    z1_filter_edge_calc_loop2
3408    fst.d          f12,     t2,      24
3409
3410    //32-39
3411    vreplvei.b     vr10,    vr7,     9
3412    vextrins.b     vr10,    vr7,     0x08
3413    z1_filter_edge_calc_other
3414    fst.d          f12,     t2,      32
3415
3416    //40-47
3417    vreplvei.b     vr12,    vr12,    1
3418    fst.d          f12,     t2,      40
3419
3420    fst.d          f12,     t2,      48 //48-55
3421    fst.d          f12,     t2,      56 //56-63
3422    fst.d          f12,     t2,      64 //64-71
3423    fst.d          f12,     t2,      72 //72-79
3424
3425    b              .IPRED_Z1_FS_END
3426
3427.IPRED_Z1_FS_W32:
    // filter_edge, width 32: smooth w+h = 32+h edge pixels from a2 into
    // t2, 8 outputs per group, dispatching on height (a4).
3428    andi           t0,      a3,      32
3429    beqz           t0,      .IPRED_Z1_FS_W64
3430.IPRED_Z1_FS_W32_H8:
3431    andi           t0,      a4,      8
3432    beqz           t0,      .IPRED_Z1_FS_W32_H16
3433
3434    //0-7
3435    vld            vr7,     a2,      -1
3436    z1_filter_edge_data_init1
3437    vbsrl.v        vr13,    vr7,     3
3438    z1_filter_edge_calc_loop2
3439    fst.d          f12,     t2,      0
3440
3441    //8-15
3442    vld            vr7,     a2,      6
3443    z1_filter_edge_data_init2
3444    vbsrl.v        vr13,    vr7,     4
3445    z1_filter_edge_calc_loop2
3446    fst.d          f12,     t2,      8
3447
3448    //16-23
3449    vld            vr7,     a2,      14
3450    z1_filter_edge_data_init2
3451    vbsrl.v        vr13,    vr7,     4
3452    z1_filter_edge_calc_loop2
3453    fst.d          f12,     t2,      16
3454
3455    //24-31
3456    vld            vr7,     a2,      22
3457    z1_filter_edge_data_init2
3458    vbsrl.v        vr13,    vr7,     4
3459    z1_filter_edge_calc_loop2
3460    fst.d          f12,     t2,      24
3461
3462    //32-39
3463    vld            vr7,     a2,      30
3464    z1_filter_edge_data_init3
3465    vbsrl.v        vr13,    vr7,     4
3466    vextrins.b     vr13,    vr13,    0x65
3467    vextrins.b     vr13,    vr13,    0x75
3468    z1_filter_edge_calc_loop2
3469    fst.d          f12,     t2,      32
3470
3471    b              .IPRED_Z1_FS_END
3472
3473.IPRED_Z1_FS_W32_H16:
3474    andi           t0,      a4,      16
3475    beqz           t0,      .IPRED_Z1_FS_W32_H32
3476
3477    //0-7
3478    vld            vr7,     a2,      -1
3479    z1_filter_edge_data_init1
3480    vbsrl.v        vr13,    vr7,     3
3481    z1_filter_edge_calc_loop2
3482    fst.d          f12,     t2,      0
3483
3484    //8-15
3485    vld            vr7,     a2,      6
3486    z1_filter_edge_data_init2
3487    vbsrl.v        vr13,    vr7,     4
3488    z1_filter_edge_calc_loop2
3489    fst.d          f12,     t2,      8
3490
3491    //16-23
3492    vld            vr7,     a2,      14
3493    z1_filter_edge_data_init2
3494
3495    vbsrl.v        vr13,    vr7,     4
3496    z1_filter_edge_calc_loop2
3497    fst.d          f12,     t2,      16
3498
3499    //24-31
3500    vld            vr7,     a2,      22
3501    z1_filter_edge_data_init2
3502    vbsrl.v        vr13,    vr7,     4
3503    z1_filter_edge_calc_loop2
3504    fst.d          f12,     t2,      24
3505
3506    //32-39
3507    vld            vr7,     a2,      30
3508    z1_filter_edge_data_init2
3509    vbsrl.v        vr13,    vr7,     4
3510    z1_filter_edge_calc_loop2
3511    fst.d          f12,     t2,      32
3512
3513    //40-47
3514    vld            vr7,     a2,      38
3515    z1_filter_edge_data_init3
3516    vbsrl.v        vr13,    vr7,     4
3517    vextrins.b     vr13,    vr13,    0x65
3518    vextrins.b     vr13,    vr13,    0x75
3519    z1_filter_edge_calc_loop2
3520    fst.d          f12,     t2,      40
3521
3522    b              .IPRED_Z1_FS_END
3523
3524.IPRED_Z1_FS_W32_H32:
3525    andi           t0,      a4,      32
3526    beqz           t0,      .IPRED_Z1_FS_W32_H64
3527
3528    //0-7
3529    vld            vr7,     a2,      -1
3530    z1_filter_edge_data_init1
3531    vbsrl.v        vr13,    vr7,     3
3532    z1_filter_edge_calc_loop2
3533    fst.d          f12,     t2,      0
3534
3535    //8-15
3536    vld            vr7,     a2,      6
3537    z1_filter_edge_data_init2
3538    vbsrl.v        vr13,    vr7,     4
3539    z1_filter_edge_calc_loop2
3540    fst.d          f12,     t2,      8
3541
3542    //16-23
3543    vld            vr7,     a2,      14
3544    z1_filter_edge_data_init2
3545    vbsrl.v        vr13,    vr7,     4
3546    z1_filter_edge_calc_loop2
3547    fst.d          f12,     t2,      16
3548
3549    //24-31
3550    vld            vr7,     a2,      22
3551    z1_filter_edge_data_init2
3552    vbsrl.v        vr13,    vr7,     4
3553    z1_filter_edge_calc_loop2
3554    fst.d          f12,     t2,      24
3555
3556    //32-39
3557    vld            vr7,     a2,      30
3558    z1_filter_edge_data_init2
3559    vbsrl.v        vr13,    vr7,     4
3560    z1_filter_edge_calc_loop2
3561    fst.d          f12,     t2,      32
3562
3563    //40-47
3564    vld            vr7,     a2,      38
3565    z1_filter_edge_data_init2
3566    vbsrl.v        vr13,    vr7,     4
3567    z1_filter_edge_calc_loop2
3568    fst.d          f12,     t2,      40
3569
3570    //48-55
3571    vld            vr7,     a2,      46
3572    z1_filter_edge_data_init2
3573    vbsrl.v        vr13,    vr7,     4
3574    z1_filter_edge_calc_loop2
3575    fst.d          f12,     t2,      48
3576
3577    //56-63
3578    vld            vr7,     a2,      54
3579    z1_filter_edge_data_init3
3580    vbsrl.v        vr13,    vr7,     4
3581    vextrins.b     vr13,    vr13,    0x65
3582    vextrins.b     vr13,    vr13,    0x75
3583    z1_filter_edge_calc_loop2
3584    fst.d          f12,     t2,      56
3585
3586    b              .IPRED_Z1_FS_END
3587
3588.IPRED_Z1_FS_W32_H64:
3589    //0-7
3590    vld            vr7,     a2,      -1
3591    z1_filter_edge_data_init1
3592    vbsrl.v        vr13,    vr7,     3
3593    z1_filter_edge_calc_loop2
3594    fst.d          f12,     t2,      0
3595
3596    //8-15
3597    vld            vr7,     a2,      6
3598    z1_filter_edge_data_init2
3599    vbsrl.v        vr13,    vr7,     4
3600    z1_filter_edge_calc_loop2
3601    fst.d          f12,     t2,      8
3602
3603    //16-23
3604    vld            vr7,     a2,      14
3605    z1_filter_edge_data_init2
3606    vbsrl.v        vr13,    vr7,     4
3607    z1_filter_edge_calc_loop2
3608    fst.d          f12,     t2,      16
3609
3610    //24-31
3611    vld            vr7,     a2,      22
3612    z1_filter_edge_data_init2
3613    vbsrl.v        vr13,    vr7,     4
3614    z1_filter_edge_calc_loop2
3615    fst.d          f12,     t2,      24
3616
3617    //32-39
3618    vld            vr7,     a2,      30
3619    z1_filter_edge_data_init2
3620    vbsrl.v        vr13,    vr7,     4
3621    z1_filter_edge_calc_loop2
3622    fst.d          f12,     t2,      32
3623
3624    //40-47
3625    vld            vr7,     a2,      38
3626    z1_filter_edge_data_init2
3627    vbsrl.v        vr13,    vr7,     4
3628    z1_filter_edge_calc_loop2
3629    fst.d          f12,     t2,      40
3630
3631    //48-55
3632    vld            vr7,     a2,      46
3633    z1_filter_edge_data_init2
3634    vbsrl.v        vr13,    vr7,     4
3635    z1_filter_edge_calc_loop2
3636    fst.d          f12,     t2,      48
3637
3638    //56-63
3639    vld            vr7,     a2,      54
3640    z1_filter_edge_data_init3
3641    vbsrl.v        vr13,    vr7,     4
3642    vextrins.b     vr13,    vr13,    0x65
3643    vextrins.b     vr13,    vr13,    0x75
3644    z1_filter_edge_calc_loop2
3645    fst.d          f12,     t2,      56
3646
3647    //64-71
    // Replicate the last real edge pixel to filter the padded region.
3648    vreplvei.b     vr10,    vr7,     9
3649    vextrins.b     vr10,    vr7,     0x08
3650    z1_filter_edge_calc_other
3651    fst.d          f12,     t2,      64
3652
3653    //72-79
3654    vreplvei.b     vr12,    vr12,    1
3655    fst.d          f12,     t2,      72
3656
3657    fst.d          f12,     t2,      80 //80-87
3658    fst.d          f12,     t2,      88 //88-95
3659
3660    b              .IPRED_Z1_FS_END
3661
3662.IPRED_Z1_FS_W64:
    // filter_edge, width 64: smooth w+h = 64+h edge pixels from a2 into
    // t2, 8 outputs per group, dispatching on height (a4).
3663.IPRED_Z1_FS_W64_H16:
3664    andi           t0,      a4,      16
3665    beqz           t0,      .IPRED_Z1_FS_W64_H32
3666
3667    //0-7
3668    vld            vr7,     a2,      -1
3669    z1_filter_edge_data_init1
3670    vbsrl.v        vr13,    vr7,     3
3671    z1_filter_edge_calc_loop2
3672    fst.d          f12,     t2,      0
3673
3674    //8-15
3675    vld            vr7,     a2,      6
3676    z1_filter_edge_data_init2
3677    vbsrl.v        vr13,    vr7,     4
3678    z1_filter_edge_calc_loop2
3679    fst.d          f12,     t2,      8
3680
3681    //16-23
3682    vld            vr7,     a2,      14
3683    z1_filter_edge_data_init2
3684    vbsrl.v        vr13,    vr7,     4
3685    z1_filter_edge_calc_loop2
3686    fst.d          f12,     t2,      16
3687
3688    //24-31
3689    vld            vr7,     a2,      22
3690    z1_filter_edge_data_init2
3691    vbsrl.v        vr13,    vr7,     4
3692    z1_filter_edge_calc_loop2
3693    fst.d          f12,     t2,      24
3694
3695    //32-39
3696    vld            vr7,     a2,      30
3697    z1_filter_edge_data_init2
3698    vbsrl.v        vr13,    vr7,     4
3699    z1_filter_edge_calc_loop2
3700    fst.d          f12,     t2,      32
3701
3702    //40-47
3703    vld            vr7,     a2,      38
3704    z1_filter_edge_data_init2
3705    vbsrl.v        vr13,    vr7,     4
3706    z1_filter_edge_calc_loop2
3707    fst.d          f12,     t2,      40
3708
3709    //48-55
3710    vld            vr7,     a2,      46
3711    z1_filter_edge_data_init2
3712    vbsrl.v        vr13,    vr7,     4
3713    z1_filter_edge_calc_loop2
3714    fst.d          f12,     t2,      48
3715
3716    //56-63
3717    vld            vr7,     a2,      54
3718    z1_filter_edge_data_init2
3719    vbsrl.v        vr13,    vr7,     4
3720    z1_filter_edge_calc_loop2
3721    fst.d          f12,     t2,      56
3722
3723    //64-71
3724    vld            vr7,     a2,      62
3725    z1_filter_edge_data_init2
3726    vbsrl.v        vr13,    vr7,     4
3727    z1_filter_edge_calc_loop2
3728    fst.d          f12,     t2,      64
3729
3730    //72-79
3731    vld            vr7,     a2,      70
3732    z1_filter_edge_data_init3
3733    vbsrl.v        vr13,    vr7,     4
3734    vextrins.b     vr13,    vr13,    0x65
3735    vextrins.b     vr13,    vr13,    0x75
3736    z1_filter_edge_calc_loop2
3737    fst.d          f12,     t2,      72
3738
3739    b              .IPRED_Z1_FS_END
3740
3741.IPRED_Z1_FS_W64_H32:
3742    andi           t0,      a4,      32
3743    beqz           t0,      .IPRED_Z1_FS_W64_H64
3744
3745    //0-7
3746    vld            vr7,     a2,      -1
3747    z1_filter_edge_data_init1
3748    vbsrl.v        vr13,    vr7,     3
3749    z1_filter_edge_calc_loop2
3750    fst.d          f12,     t2,      0
3751
3752    //8-15
3753    vld            vr7,     a2,      6
3754    z1_filter_edge_data_init2
3755    vbsrl.v        vr13,    vr7,     4
3756    z1_filter_edge_calc_loop2
3757    fst.d          f12,     t2,      8
3758
3759    //16-23
3760    vld            vr7,     a2,      14
3761    z1_filter_edge_data_init2
3762    vbsrl.v        vr13,    vr7,     4
3763    z1_filter_edge_calc_loop2
3764    fst.d          f12,     t2,      16
3765
3766    //24-31
3767    vld            vr7,     a2,      22
3768    z1_filter_edge_data_init2
3769    vbsrl.v        vr13,    vr7,     4
3770    z1_filter_edge_calc_loop2
3771    fst.d          f12,     t2,      24
3772
3773    //32-39
3774    vld            vr7,     a2,      30
3775    z1_filter_edge_data_init2
3776    vbsrl.v        vr13,    vr7,     4
3777    z1_filter_edge_calc_loop2
3778    fst.d          f12,     t2,      32
3779
3780    //40-47
3781    vld            vr7,     a2,      38
3782    z1_filter_edge_data_init2
3783    vbsrl.v        vr13,    vr7,     4
3784    z1_filter_edge_calc_loop2
3785    fst.d          f12,     t2,      40
3786
3787    //48-55
3788    vld            vr7,     a2,      46
3789    z1_filter_edge_data_init2
3790    vbsrl.v        vr13,    vr7,     4
3791    z1_filter_edge_calc_loop2
3792    fst.d          f12,     t2,      48
3793
3794    //56-63
3795    vld            vr7,     a2,      54
3796    z1_filter_edge_data_init2
3797    vbsrl.v        vr13,    vr7,     4
3798    z1_filter_edge_calc_loop2
3799    fst.d          f12,     t2,      56
3800
3801    //64-71
3802    vld            vr7,     a2,      62
3803    z1_filter_edge_data_init2
3804    vbsrl.v        vr13,    vr7,     4
3805    z1_filter_edge_calc_loop2
3806    fst.d          f12,     t2,      64
3807
3808    //72-79
3809    vld            vr7,     a2,      70
3810    z1_filter_edge_data_init2
3811    vbsrl.v        vr13,    vr7,     4
3812    z1_filter_edge_calc_loop2
3813    fst.d          f12,     t2,      72
3814
3815    //80-87
3816    vld            vr7,     a2,      78
3817    z1_filter_edge_data_init2
3818    vbsrl.v        vr13,    vr7,     4
3819    z1_filter_edge_calc_loop2
3820    fst.d          f12,     t2,      80
3821
3822    //88-95
3823    vld            vr7,     a2,      86
3824    z1_filter_edge_data_init3
3825    vbsrl.v        vr13,    vr7,     4
3826    vextrins.b     vr13,    vr13,    0x65
3827    vextrins.b     vr13,    vr13,    0x75
3828    z1_filter_edge_calc_loop2
3829    fst.d          f12,     t2,      88
3830
3831    b              .IPRED_Z1_FS_END
3832
    // z1 edge filter, w=64 h=64 case: smooth 128 edge bytes (16 unrolled
    // 8-pixel segments) from the raw top/left edge at a2 into the scratch
    // buffer at t2.  Each segment loads 16 raw bytes starting one byte
    // before the segment (overlap provides the outer filter taps), builds
    // the taps via the z1_filter_edge_data_init*/calc macros (defined
    // earlier in this file), and stores 8 filtered bytes.
3833.IPRED_Z1_FS_W64_H64:
3834    //0-7
3835    vld            vr7,     a2,      -1
3836    z1_filter_edge_data_init1
    // First segment shifts by 3 (not 4): the load starts at a2-1, so the
    // data is offset by one byte relative to the later segments.
3837    vbsrl.v        vr13,    vr7,     3
3838    z1_filter_edge_calc_loop2
3839    fst.d          f12,     t2,      0
3840
3841    //8-15
3842    vld            vr7,     a2,      6
3843    z1_filter_edge_data_init2
3844    vbsrl.v        vr13,    vr7,     4
3845    z1_filter_edge_calc_loop2
3846    fst.d          f12,     t2,      8
3847
3848    //16-23
3849    vld            vr7,     a2,      14
3850    z1_filter_edge_data_init2
3851    vbsrl.v        vr13,    vr7,     4
3852    z1_filter_edge_calc_loop2
3853    fst.d          f12,     t2,      16
3854
3855    //24-31
3856    vld            vr7,     a2,      22
3857    z1_filter_edge_data_init2
3858    vbsrl.v        vr13,    vr7,     4
3859    z1_filter_edge_calc_loop2
3860    fst.d          f12,     t2,      24
3861
3862    //32-39
3863    vld            vr7,     a2,      30
3864    z1_filter_edge_data_init2
3865    vbsrl.v        vr13,    vr7,     4
3866    z1_filter_edge_calc_loop2
3867    fst.d          f12,     t2,      32
3868
3869    //40-47
3870    vld            vr7,     a2,      38
3871    z1_filter_edge_data_init2
3872    vbsrl.v        vr13,    vr7,     4
3873    z1_filter_edge_calc_loop2
3874    fst.d          f12,     t2,      40
3875
3876    //48-55
3877    vld            vr7,     a2,      46
3878    z1_filter_edge_data_init2
3879    vbsrl.v        vr13,    vr7,     4
3880    z1_filter_edge_calc_loop2
3881    fst.d          f12,     t2,      48
3882
3883    //56-63
3884    vld            vr7,     a2,      54
3885    z1_filter_edge_data_init2
3886    vbsrl.v        vr13,    vr7,     4
3887    z1_filter_edge_calc_loop2
3888    fst.d          f12,     t2,      56
3889
3890    //64-71
3891    vld            vr7,     a2,      62
3892    z1_filter_edge_data_init2
3893    vbsrl.v        vr13,    vr7,     4
3894    z1_filter_edge_calc_loop2
3895    fst.d          f12,     t2,      64
3896
3897    //72-79
3898    vld            vr7,     a2,      70
3899    z1_filter_edge_data_init2
3900    vbsrl.v        vr13,    vr7,     4
3901    z1_filter_edge_calc_loop2
3902    fst.d          f12,     t2,      72
3903
3904    //80-87
3905    vld            vr7,     a2,      78
3906    z1_filter_edge_data_init2
3907    vbsrl.v        vr13,    vr7,     4
3908    z1_filter_edge_calc_loop2
3909    fst.d          f12,     t2,      80
3910
3911    //88-95
3912    vld            vr7,     a2,      86
3913    z1_filter_edge_data_init2
3914    vbsrl.v        vr13,    vr7,     4
3915    z1_filter_edge_calc_loop2
3916    fst.d          f12,     t2,      88
3917
3918    //96-103
3919    vld            vr7,     a2,      94
3920    z1_filter_edge_data_init2
3921    vbsrl.v        vr13,    vr7,     4
3922    z1_filter_edge_calc_loop2
3923    fst.d          f12,     t2,      96
3924
3925    //104-111
3926    vld            vr7,     a2,      102
3927    z1_filter_edge_data_init2
3928    vbsrl.v        vr13,    vr7,     4
3929    z1_filter_edge_calc_loop2
3930    fst.d          f12,     t2,      104
3931
3932    //112-119
3933    vld            vr7,     a2,      110
3934    z1_filter_edge_data_init2
3935    vbsrl.v        vr13,    vr7,     4
3936    z1_filter_edge_calc_loop2
3937    fst.d          f12,     t2,      112
3938
    // Final segment: data_init3 plus the two vextrins replicate byte 5 of
    // vr13 into lanes 6 and 7, extending the last edge byte so the filter
    // does not read past the end of the edge data.
3939    //120-127
3940    vld            vr7,     a2,      118
3941    z1_filter_edge_data_init3
3942    vbsrl.v        vr13,    vr7,     4
3943    vextrins.b     vr13,    vr13,    0x65
3944    vextrins.b     vr13,    vr13,    0x75
3945    z1_filter_edge_calc_loop2
3946    fst.d          f12,     t2,      120
3947
    // Join point of the filtered / unfiltered edge paths.  On exit:
    //   a7 = pointer to the edge ("top") the interpolation loop reads from
    //   t0 = max_base_x, the last valid index into that edge
3948.IPRED_Z1_FS_END:
    // Filtered path: a7 presumably holds the filtered edge length here
    // (set above this chunk) — t0 = len - 1; edge = scratch buffer t2.
3949    addi.d         t0,      a7,      -1   //max_base_x
3950    or             a7,      t2,      t2   //top
3951    b              .IPRED_Z1_UA_END
3952
    // No filtering: read directly from the caller's edge at a2, and
    // max_base_x = width + min(width, height) - 1  (a3 = width, a4 = height).
3953.IPRED_Z1_NOTFS:
3954    or             a7,      a2,      a2   //top
3955    // imin_gr
3956    blt            a3,      a4,      .Z1_IMIN1
3957    or             t0,      a4,      a4
3958    b              .Z1_IMIN2
3959.Z1_IMIN1:
3960    or             t0,      a3,      a3
3961.Z1_IMIN2:
3962
3963    add.d          t0,      a3,      t0
3964    addi.d         t0,      t0,      -1   //max_base_x
3965
    // Main z1 interpolation, t5 != 0 path (t5 presumably the upsample_above
    // flag — base advances 2 edge bytes per output pixel, so the edge holds
    // interleaved even/odd samples; verify against the function head above).
    // Per pixel: out = (top[base]*(64-frac) + top[base+1]*frac + 32) >> 6
    // Registers: a0=dst, a1=stride, a3=width, a4=height, a7=edge ptr,
    //            t0=max_base_x, t1=dx, t2=y, t3=xpos, t4=frac.
3966.IPRED_Z1_UA_END:
3967    //st dst, t1:dx  a2 a6 t6 t7
3968    beqz           t5,      .Z1_UA0
3969
    // vr0 = 64 (weight sum), vr7 = 32 (rounding term), both as halfwords.
3970    li.w           a5,      64
3971    vreplgr2vr.h   vr0,     a5
3972    vsrai.h        vr7,     vr0,     1
3973    or             t2,      zero,    zero  //y
3974    or             t3,      t1,      t1    //xpos
3975.Z1_LOOPY:
    // frac from bit 1 upward (upsampled: xpos bit 0 selects the phase),
    // vr1 = frac, vr2 = 64 - frac, base = xpos >> 6.
3976    andi           t4,      t3,      0x3e  //frac
3977    vreplgr2vr.h   vr1,     t4
3978    vsub.h         vr2,     vr0,     vr1
3979    or             a6,      zero,    zero  //x
3980    or             a2,      zero,    zero  //base_num
3981    srai.d         t6,      t3,      6     //base
3982
    // a2 = number of pixels whose base stays below max_base_x
    // (base steps by 2 per pixel on this path).
3983    or             t7,      t6,      t6
3984    bge            t7,      t0,      .Z1_LOOPX
3985.Z1_BASENUM:
3986    addi.d         a2,      a2,      1
3987    addi.d         t7,      t7,      2
3988    blt            t7,      t0,      .Z1_BASENUM
3989
3990.Z1_LOOPX:
    // If fewer valid pixels than the row width, take the clipped path.
3991    blt            a2,      a3,      .Z1_LOOPX_BASEMAX
3992
    // Whole row in-range: 8 pixels per iteration.  Even bytes of the
    // 16-byte load are top[base+2k], odd bytes are top[base+2k+1].
3993    srai.d         t8,      a3,      3  //loop param
3994    beqz           t8,      .Z1_LOOPX_W4
3995.Z1_LOOPX_W8:
3996    add.d          t5,      a7,      t6
3997    vld            vr3,     t5,      0
3998    vpickev.b      vr5,     vr3,     vr3  //0 2 4 6...
3999    vpickod.b      vr6,     vr3,     vr3  //1 3 5 7...
4000    vsllwil.hu.bu  vr5,     vr5,     0
4001    vsllwil.hu.bu  vr6,     vr6,     0
4002
    // vr3 = (e0*(64-frac) + e1*frac + 32) >> 6, narrowed back to bytes.
4003    vmul.h         vr3,     vr5,     vr2
4004    vmadd.h        vr3,     vr6,     vr1
4005    vadd.h         vr3,     vr3,     vr7
4006    vsrai.h        vr3,     vr3,     6
4007    vsrlni.b.h     vr3,     vr3,     0
4008    fstx.d         f3,      a0,      a6
4009
4010    addi.d         a6,      a6,      8
4011    addi.d         t6,      t6,      16
4012    addi.d         t8,      t8,      -1
4013    bnez           t8,      .Z1_LOOPX_W8
4014    b              .Z1_LOOPY_END
    // width == 4: widen first, then split even/odd halfwords.
4015.Z1_LOOPX_W4:
4016    vldx           vr3,     a7,      t6
4017    vsllwil.hu.bu  vr3,     vr3,     0
4018    vpickev.h      vr5,     vr3,     vr3  //0 2 4 6...
4019    vpickod.h      vr6,     vr3,     vr3  //1 3 5 7...
4020
4021    vmul.h         vr3,     vr5,     vr2
4022    vmadd.h        vr3,     vr6,     vr1
4023    vadd.h         vr3,     vr3,     vr7
4024    vsrai.h        vr3,     vr3,     6
4025    vsrlni.b.h     vr3,     vr3,     0
4026    fstx.s         f3,      a0,      a6
4027    b              .Z1_LOOPY_END
    // Clipped row: interpolate only the a2 in-range pixels
    // (8-at-a-time, then 4/2/1 tails), then fill the rest.
4028.Z1_LOOPX_BASEMAX:
4029    srai.d         t8,      a2,      3  //loop param
4030    beqz           t8,      .Z1_LOOPX_BASEMAX4
4031.Z1_LOOPX_BASEMAX8:
4032    add.d          t5,      a7,      t6
4033    vld            vr3,     t5,      0
4034    vpickev.b      vr5,     vr3,     vr3  //0 2 4 6...
4035    vpickod.b      vr6,     vr3,     vr3  //1 3 5 7...
4036    vsllwil.hu.bu  vr5,     vr5,     0
4037    vsllwil.hu.bu  vr6,     vr6,     0
4038
4039    vmul.h         vr3,     vr5,     vr2
4040    vmadd.h        vr3,     vr6,     vr1
4041    vadd.h         vr3,     vr3,     vr7
4042    vsrai.h        vr3,     vr3,     6
4043    vsrlni.b.h     vr3,     vr3,     0
4044    fstx.d         f3,      a0,      a6
4045
4046    addi.d         a6,      a6,      8
4047    addi.d         t6,      t6,      16
4048    addi.d         t8,      t8,      -1
4049    bnez           t8,      .Z1_LOOPX_BASEMAX8
4050.Z1_LOOPX_BASEMAX4:
4051    andi           t8,      a2,      4
4052    beqz           t8,      .Z1_LOOPX_BASEMAX2
4053
4054    vldx           vr3,     a7,      t6
4055    vsllwil.hu.bu  vr3,     vr3,     0
4056    vpickev.h      vr5,     vr3,     vr3  //0 2 4 6...
4057    vpickod.h      vr6,     vr3,     vr3  //1 3 5 7...
4058
4059    vmul.h         vr3,     vr5,     vr2
4060    vmadd.h        vr3,     vr6,     vr1
4061    vadd.h         vr3,     vr3,     vr7
4062    vsrai.h        vr3,     vr3,     6
4063    vsrlni.b.h     vr3,     vr3,     0
4064    fstx.s         f3,      a0,      a6
4065
4066    addi.d         a6,      a6,      4
4067    addi.d         t6,      t6,      8
4068.Z1_LOOPX_BASEMAX2:
4069    andi           t8,      a2,     2
4070    beqz           t8,      .Z1_LOOPX_BASEMAX1
4071
4072    vldx           vr3,     a7,      t6
4073    vsllwil.hu.bu  vr3,     vr3,     0
4074    vpickev.h      vr5,     vr3,     vr3  //0 2 4 6...
4075    vpickod.h      vr6,     vr3,     vr3  //1 3 5 7...
4076
4077    vmul.h         vr3,     vr5,     vr2
4078    vmadd.h        vr3,     vr6,     vr1
4079    vadd.h         vr3,     vr3,     vr7
4080    vsrai.h        vr3,     vr3,     6
4081    vsrlni.b.h     vr3,     vr3,     0
4082    vpickve2gr.bu  t7,      vr3,     0
4083    vpickve2gr.bu  t8,      vr3,     1
4084    stx.b          t7,      a0,      a6
4085    addi.d         a6,      a6,      1
4086    stx.b          t8,      a0,      a6
4087    addi.d         a6,      a6,      1
4088    addi.d         t6,      t6,      4
4089.Z1_LOOPX_BASEMAX1:
4090    andi           t8,      a2,     1
4091    beqz           t8,      .Z1_LOOPX_BASEMAX_MSET
4092
    // Last odd pixel: scalar (top[base]*(64-frac)+top[base+1]*frac+32)>>6.
4093    add.d          a2,      a7,      t6
4094    sub.d          t7,      a5,      t4
4095    ld.bu          t8,      a2,      0
4096    mul.w          t7,      t7,      t8
4097    ld.bu          t8,      a2,      1
4098    mul.w          t8,      t8,      t4
4099    add.d          t7,      t7,      t8
4100    addi.d         t7,      t7,      32
4101    srai.d         t7,      t7,      6
4102    stx.b          t7,      a0,      a6
4103
4104    addi.d         a6,      a6,      1
    // Fill the remaining width-a6 pixels with top[max_base_x]
    // (pixel_set_8bpc_allw macro; t8/t4 are its scratch regs).
4105.Z1_LOOPX_BASEMAX_MSET:  //memset
4106    add.d          t6,      a0,      a6  //dst
4107    add.d          t7,      a7,      t0  //src
4108    sub.d          a2,      a3,      a6  //size
4109    pixel_set_8bpc_allw t6, t7, a2, t8, t4
    // Next row: dst += stride, xpos += dx, while y < height.
4110.Z1_LOOPY_END:
4111    addi.d         t2,      t2,      1
4112    add.d          a0,      a0,      a1
4113    add.d          t3,      t3,      t1
4114    blt            t2,      a4,      .Z1_LOOPY
4115    b              .Z1_END
4116
    // Main z1 interpolation, t5 == 0 path (edge not upsampled): base
    // advances 1 edge byte per output pixel; neighbours top[base] and
    // top[base+1] come from two overlapping loads (offset 0 and 1).
    // Same per-pixel formula and register roles as the loop above.
4117.Z1_UA0:
    // vr0 = 64 (weight sum), vr7 = 32 (rounding term), both as halfwords.
4118    li.w           a5,      64
4119    vreplgr2vr.h   vr0,     a5
4120    vsrai.h        vr7,     vr0,     1
4121    or             t2,      zero,    zero  //y
4122    or             t3,      t1,      t1    //xpos
4123.Z1_UA0_LOOPY:
4124    andi           t4,      t3,      0x3e  //frac
4125    vreplgr2vr.h   vr1,     t4
4126    vsub.h         vr2,     vr0,     vr1
4127    or             a6,      zero,    zero  //x
4128    srai.d         t6,      t3,      6     //base
4129
    // base_num = max(max_base_x - base, 0): pixels still inside the edge.
4130    sub.d          a2,      t0,      t6     //a2:base_num
4131    blt            a2,      zero,    .Z1_UA0_BASENUM
4132    b              .Z1_UA0_LOOPX
4133.Z1_UA0_BASENUM:
4134    or             a2,      zero,    zero
4135
4136.Z1_UA0_LOOPX:
    // If fewer valid pixels than the row width, take the clipped path.
4137    blt            a2,      a3,      .Z1_UA0_LOOPX_BASEMAX
4138
    // Whole row in-range: 8 pixels per iteration; vr5 = top[base..],
    // vr6 = top[base+1..] via a one-byte-offset reload.
4139    srai.d         t8,      a3,      3  //loop param
4140    beqz           t8,      .Z1_UA0_LOOPX_W4
4141.Z1_UA0_LOOPX_W8:
4142    add.d          t5,      a7,      t6
4143    vld            vr5,     t5,      0
4144    vld            vr6,     t5,      1
4145    vsllwil.hu.bu  vr5,     vr5,     0
4146    vsllwil.hu.bu  vr6,     vr6,     0
4147
    // vr3 = (e0*(64-frac) + e1*frac + 32) >> 6, narrowed back to bytes.
4148    vmul.h         vr3,     vr5,     vr2
4149    vmadd.h        vr3,     vr6,     vr1
4150    vadd.h         vr3,     vr3,     vr7
4151    vsrai.h        vr3,     vr3,     6
4152    vsrlni.b.h     vr3,     vr3,     0
4153    fstx.d         f3,      a0,      a6
4154
4155    addi.d         a6,      a6,      8
4156    addi.d         t6,      t6,      8
4157    addi.d         t8,      t8,      -1
4158    bnez           t8,      .Z1_UA0_LOOPX_W8
4159    b              .Z1_UA0_LOOPY_END
    // width == 4: widen once, vr6 = vr5 shifted one halfword = top[base+1..].
4160.Z1_UA0_LOOPX_W4:
4161    vldx           vr5,     a7,      t6
4162    vsllwil.hu.bu  vr5,     vr5,     0
4163    vbsrl.v        vr6,     vr5,     2
4164
4165    vmul.h         vr3,     vr5,     vr2
4166    vmadd.h        vr3,     vr6,     vr1
4167    vadd.h         vr3,     vr3,     vr7
4168    vsrai.h        vr3,     vr3,     6
4169    vsrlni.b.h     vr3,     vr3,     0
4170    fstx.s         f3,      a0,      a6
4171    b              .Z1_UA0_LOOPY_END
    // Clipped row: interpolate only the a2 in-range pixels
    // (8-at-a-time, then 4/2/1 tails), then fill the rest.
4172.Z1_UA0_LOOPX_BASEMAX:
4173    srai.d         t8,      a2,      3  //loop param
4174    beqz           t8,      .Z1_UA0_LOOPX_BASEMAX4
4175.Z1_UA0_LOOPX_BASEMAX8:
4176    add.d          t5,      a7,      t6
4177    vld            vr5,     t5,      0
4178    vld            vr6,     t5,      1
4179    vsllwil.hu.bu  vr5,     vr5,     0
4180    vsllwil.hu.bu  vr6,     vr6,     0
4181
4182    vmul.h         vr3,     vr5,     vr2
4183    vmadd.h        vr3,     vr6,     vr1
4184    vadd.h         vr3,     vr3,     vr7
4185    vsrai.h        vr3,     vr3,     6
4186    vsrlni.b.h     vr3,     vr3,     0
4187    fstx.d         f3,      a0,      a6
4188
4189    addi.d         a6,      a6,      8
4190    addi.d         t6,      t6,      8
4191    addi.d         t8,      t8,      -1
4192    bnez           t8,      .Z1_UA0_LOOPX_BASEMAX8
4193.Z1_UA0_LOOPX_BASEMAX4:
4194    andi           t8,      a2,      4
4195    beqz           t8,      .Z1_UA0_LOOPX_BASEMAX2
4196
4197    vldx           vr5,     a7,      t6
4198    vsllwil.hu.bu  vr5,     vr5,     0
4199    vbsrl.v        vr6,     vr5,     2
4200
4201    vmul.h         vr3,     vr5,     vr2
4202    vmadd.h        vr3,     vr6,     vr1
4203    vadd.h         vr3,     vr3,     vr7
4204    vsrai.h        vr3,     vr3,     6
4205    vsrlni.b.h     vr3,     vr3,     0
4206    fstx.s         f3,      a0,      a6
4207
4208    addi.d         a6,      a6,      4
4209    addi.d         t6,      t6,      4
4210.Z1_UA0_LOOPX_BASEMAX2:
4211    andi           t8,      a2,     2
4212    beqz           t8,      .Z1_UA0_LOOPX_BASEMAX1
4213
4214    vldx           vr5,     a7,      t6
4215    vsllwil.hu.bu  vr5,     vr5,     0
4216    vbsrl.v        vr6,     vr5,     2
4217
4218    vmul.h         vr3,     vr5,     vr2
4219    vmadd.h        vr3,     vr6,     vr1
4220    vadd.h         vr3,     vr3,     vr7
4221    vsrai.h        vr3,     vr3,     6
4222    vsrlni.b.h     vr3,     vr3,     0
4223    vpickve2gr.bu  t7,      vr3,     0
4224    vpickve2gr.bu  t8,      vr3,     1
4225    stx.b          t7,      a0,      a6
4226    addi.d         a6,      a6,      1
4227    stx.b          t8,      a0,      a6
4228    addi.d         a6,      a6,      1
4229    addi.d         t6,      t6,      2
4230.Z1_UA0_LOOPX_BASEMAX1:
4231    andi           t8,      a2,     1
4232    beqz           t8,      .Z1_UA0_LOOPX_BASEMAX_MSET
4233
    // Last odd pixel: scalar (top[base]*(64-frac)+top[base+1]*frac+32)>>6.
4234    add.d          a2,      a7,      t6
4235    sub.d          t7,      a5,      t4
4236    ld.bu          t8,      a2,      0
4237    mul.w          t7,      t7,      t8
4238    ld.bu          t8,      a2,      1
4239    mul.w          t8,      t8,      t4
4240    add.d          t7,      t7,      t8
4241    addi.d         t7,      t7,      32
4242    srai.d         t7,      t7,      6
4243    stx.b          t7,      a0,      a6
4244
4245    addi.d         a6,      a6,      1
    // Fill the remaining width-a6 pixels with top[max_base_x]
    // (pixel_set_8bpc_allw macro; t8/t4 are its scratch regs).
4246.Z1_UA0_LOOPX_BASEMAX_MSET:  //memset
4247    add.d          t6,      a0,      a6  //dst
4248    add.d          t7,      a7,      t0  //src
4249    sub.d          a2,      a3,      a6  //size
4250    pixel_set_8bpc_allw t6, t7, a2, t8, t4
    // Next row: dst += stride, xpos += dx, while y < height.
4251.Z1_UA0_LOOPY_END:
4252    addi.d         t2,      t2,      1
4253    add.d          a0,      a0,      a1
4254    add.d          t3,      t3,      t1
4255    blt            t2,      a4,      .Z1_UA0_LOOPY
4256
    // Epilogue: release the 128-byte stack scratch area used for the
    // filtered edge (presumably allocated in the prologue above this
    // chunk — verify against the function head).
4257.Z1_END:
4258    addi.d         sp,      sp,      128
4259endfunc
4260
4261