/*
 * Copyright © 2023, VideoLAN and dav1d authors
 * Copyright © 2023, Loongson Technology Corporation Limited
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/loongarch/loongson_asm.S"

/*
static void splat_mv_c(refmvs_block **rr, const refmvs_block *const rmv,
                       const int bx4, const int bw4, int bh4)
*/
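
/*
 * For orientation: the scalar reference this routine replaces is roughly the
 * sketch below (a paraphrase of the generic C path; the 12-byte refmvs_block
 * size it relies on matches the 0..11 byte pattern rotated into vr1/vr2/vr3,
 * which lets three 16-byte stores cover four blocks at a time).
 *
 *     static void splat_mv_c(refmvs_block **rr, const refmvs_block *const rmv,
 *                            const int bx4, const int bw4, int bh4)
 *     {
 *         do {
 *             refmvs_block *const r = *rr++ + bx4;
 *             for (int x = 0; x < bw4; x++)
 *                 r[x] = *rmv;            // one 12-byte block per 4x4 unit
 *         } while (--bh4);
 *     }
 */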

function splat_mv_lsx
    vld           vr0,      a1,       0          // 0 1 ... 11 ...
    clz.w         t4,       a3
    vaddi.bu      vr1,      vr0,      0
    addi.w        t4,       t4,       -26
    vextrins.w    vr1,      vr0,      0x30       // 0 1 2 ... 11 0 1 2 3
    la.local      t5,       .SPLAT_LSX_JRTABLE
    vbsrl.v       vr2,      vr1,      4          // 4 5 6 7...11 0 1 2 3 0 0 0 0
    alsl.d        t6,       t4,       t5,     1
    vextrins.w    vr2,      vr0,      0x31       // 4 5 6 7...11 0 1 2 3 4 5 6 7
    ld.h          t7,       t6,       0
    vbsrl.v       vr3,      vr2,      4          // 8 9 10 11 0 1 2 3 4 5 6 7 0 0 0 0
    add.d         t8,       t5,       t7
    alsl.d        a2,       a2,       a2,     1
    vextrins.w    vr3,      vr0,      0x32       // 8 9 10 11 0 1 2 3 4 5 6 7 8 9 10 11
    slli.w        a2,       a2,       2
    jirl          $r0,      t8,       0

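// Dispatch on bw4 (1, 2, 4, 8, 16 or 32): clz.w(bw4) - 26 yields 5..0, which
// indexes the .hword offsets below from the W32 case down to the W1 case.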
.SPLAT_LSX_JRTABLE:
    .hword .SPLAT_W32_LSX - .SPLAT_LSX_JRTABLE
    .hword .SPLAT_W16_LSX - .SPLAT_LSX_JRTABLE
    .hword .SPLAT_W8_LSX  - .SPLAT_LSX_JRTABLE
    .hword .SPLAT_W4_LSX  - .SPLAT_LSX_JRTABLE
    .hword .SPLAT_W2_LSX  - .SPLAT_LSX_JRTABLE
    .hword .SPLAT_W1_LSX  - .SPLAT_LSX_JRTABLE

.SPLAT_W1_LSX:
    ld.d          t3,       a0,       0
    addi.d        a0,       a0,       8
    addi.d        a4,       a4,       -1
    add.d         t3,       t3,       a2

    fst.d         f1,       t3,       0
    fst.s         f3,       t3,       8
    blt           zero,     a4,       .SPLAT_W1_LSX
    b             .splat_end
.SPLAT_W2_LSX:
    ld.d          t3,       a0,       0
    addi.d        a0,       a0,       8
    addi.d        a4,       a4,       -1
    add.d         t3,       t3,       a2

    vst           vr1,      t3,       0
    fst.d         f2,       t3,       16
    blt           zero,     a4,       .SPLAT_W2_LSX
    b             .splat_end

.SPLAT_W4_LSX:
    ld.d          t3,       a0,       0
    addi.d        a0,       a0,       8
    addi.d        a4,       a4,       -1
    add.d         t3,       t3,       a2

    vst           vr1,      t3,       0
    vst           vr2,      t3,       16
    vst           vr3,      t3,       32
    blt           zero,     a4,       .SPLAT_W4_LSX
    b             .splat_end

.SPLAT_W8_LSX:
    ld.d          t3,       a0,       0
    addi.d        a0,       a0,       8
    addi.d        a4,       a4,       -1
    add.d         t3,       t3,       a2

    vst           vr1,      t3,       0
    vst           vr2,      t3,       16
    vst           vr3,      t3,       32

    vst           vr1,      t3,       48
    vst           vr2,      t3,       64
    vst           vr3,      t3,       80
    blt           zero,     a4,       .SPLAT_W8_LSX
    b             .splat_end

.SPLAT_W16_LSX:
    ld.d          t3,       a0,       0
    addi.d        a0,       a0,       8
    addi.d        a4,       a4,       -1
    add.d         t3,       t3,       a2

.rept 2
    vst           vr1,      t3,       0
    vst           vr2,      t3,       16
    vst           vr3,      t3,       32

    vst           vr1,      t3,       48
    vst           vr2,      t3,       64
    vst           vr3,      t3,       80

    addi.d        t3,       t3,       96
.endr

    blt           zero,     a4,       .SPLAT_W16_LSX
    b             .splat_end

.SPLAT_W32_LSX:
    ld.d          t3,       a0,       0
    addi.d        a0,       a0,       8
    addi.d        a4,       a4,       -1
    add.d         t3,       t3,       a2

.rept 4
    vst           vr1,      t3,       0
    vst           vr2,      t3,       16
    vst           vr3,      t3,       32

    vst           vr1,      t3,       48
    vst           vr2,      t3,       64
    vst           vr3,      t3,       80

    addi.d        t3,       t3,       96
.endr

    blt           zero,     a4,       .SPLAT_W32_LSX

.splat_end:
endfunc

const la_div_mult
.short    0, 16384, 8192, 5461, 4096, 3276, 2730, 2340
.short 2048,  1820, 1638, 1489, 1365, 1260, 1170, 1092
.short 1024,   963,  910,  862,  819,  780,  744,  712
.short  682,   655,  630,  606,  585,  564,  546,  528
endconst
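
/*
 * la_div_mult[d] is the fixed-point reciprocal round(2^14 / d) used for the
 * temporal MV projection in load_tmvs_lsx below. A C sketch of the projection
 * that the vmul.w / vssrarni.h.w / vclip.h sequence implements (iclip and the
 * 16-bit y/x mv layout are assumptions matching the rest of dav1d):
 *
 *     typedef struct mv { int16_t y, x; } mv;
 *
 *     static int iclip(const int v, const int lo, const int hi) {
 *         return v < lo ? lo : v > hi ? hi : v;
 *     }
 *
 *     static mv mv_projection(const mv m, const int num, const int den) {
 *         static const uint16_t div_mult[32] = {
 *                0, 16384, 8192, 5461, 4096, 3276, 2730, 2340,
 *             2048,  1820, 1638, 1489, 1365, 1260, 1170, 1092,
 *             1024,   963,  910,  862,  819,  780,  744,  712,
 *              682,   655,  630,  606,  585,  564,  546,  528,
 *         };
 *         const int frac = num * div_mult[den];
 *         const int y = m.y * frac, x = m.x * frac;
 *         // Round to nearest (ties away from zero), then clip to +/-0x3fff.
 *         return (mv) { .y = iclip((y + 8192 + (y >> 31)) >> 14, -0x3fff, 0x3fff),
 *                       .x = iclip((x + 8192 + (x >> 31)) >> 14, -0x3fff, 0x3fff) };
 *     }
 */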

/*
 * Fill one row range of rp_proj (base pointer t3, advanced by t2 = stride * 5
 * bytes per row) with the INVALID_MV sentinel held in t7, two 5-byte entries
 * per iteration; the \is_odd variant handles an odd column count (t6).
 *  temp reg: a6 a7
 */
.macro LOAD_SET_LOOP is_odd
    slli.d          a6,      t6,     2
    add.d           a6,      a6,     t6  // col_w * 5
0:
    addi.d          a7,      zero,   0   // x
.if \is_odd
    stx.w           t7,      t3,     a7
    addi.d          a7,      a7,     5
    bge             a7,      a6,     2f
.endif

1:
    stx.w           t7,      t3,     a7
    addi.d          a7,      a7,     5
    stx.w           t7,      t3,     a7
    addi.d          a7,      a7,     5
    blt             a7,      a6,     1b
2:
    add.d           t3,      t3,     t2
    addi.d          t5,      t5,     1
    blt             t5,      a5,     0b
.endm
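
/*
 * In C terms the macro is roughly the invalidation pass of load_tmvs (a
 * sketch; INVALID_MV = 0x80008000 and the 4-byte mv + 1-byte ref layout of
 * refmvs_temporal_block follow from the stores above):
 *
 *     for (int y = row_start8; y < row_end8; y++) {
 *         for (int x = col_start8; x < col_end8; x++)
 *             rp_proj[x].mv.n = INVALID_MV;
 *         rp_proj += stride;
 *     }
 */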

/*
 * static void load_tmvs_c(const refmvs_frame *const rf, int tile_row_idx,
 *                         const int col_start8, const int col_end8,
 *                         const int row_start8, int row_end8)
 */
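/*
 * Rough shape of the projection loop implemented below (a paraphrase, not a
 * verbatim copy of the C reference; imax/imin/apply_sign are stand-ins, and
 * mv_projection is the routine sketched next to la_div_mult above):
 *
 *     for (int n = 0; n < rf->n_mfmvs; n++) {
 *         if (rf->mfmv_ref2cur[n] is invalid) continue;
 *         const int ref = rf->mfmv_ref[n], ref_sign = ref - 4;
 *         const refmvs_temporal_block *r = &rf->rp_ref[ref][row_start8 * stride];
 *         for (int y = row_start8; y < row_end8; y++, r += stride) {
 *             const int y_proj_start = imax(y & ~7, row_start8);
 *             const int y_proj_end   = imin((y & ~7) + 8, row_end8);
 *             for (int x = col_start8i; x < col_end8i; ) {
 *                 const refmvs_temporal_block *rb = &r[x];
 *                 const int ref2ref = rb->ref ? rf->mfmv_ref2ref[n][rb->ref - 1] : 0;
 *                 if (!ref2ref) { x++; continue; }
 *                 const mv offset = mv_projection(rb->mv, rf->mfmv_ref2cur[n], ref2ref);
 *                 const int pos_x = x + apply_sign(abs(offset.x) >> 6, offset.x ^ ref_sign);
 *                 const int pos_y = y + apply_sign(abs(offset.y) >> 6, offset.y ^ ref_sign);
 *                 // If pos_y falls inside [y_proj_start, y_proj_end), store rb->mv and
 *                 // ref2ref at rp_proj[(pos_y & 15) * stride + pos_x] for as long as the
 *                 // run of identical rb entries keeps pos_x inside the superblock-aligned
 *                 // x window (the unrolled .loop_posx code below); otherwise just skip
 *                 // over the run of identical entries (.n_posy). x advances past the run.
 *             }
 *         }
 *     }
 */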
function load_tmvs_lsx
    addi.d         sp,      sp,       -80
    st.d           s0,      sp,       0
    st.d           s1,      sp,       8
    st.d           s2,      sp,       16
    st.d           s3,      sp,       24
    st.d           s4,      sp,       32
    st.d           s5,      sp,       40
    st.d           s6,      sp,       48
    st.d           s7,      sp,       56
    st.d           s8,      sp,       64

    vld           vr16,     a0,       16
    vld           vr0,      a0,       52    // rf->mfmv_ref
    ld.w          s8,       a0,       152   // [0] - rf->n_mfmvs
    vld           vr17,     a0,       168   // [0] - rp_ref | [1] - rp_proj
    ld.d          t1,       a0,       184   // stride
    ld.w          t0,       a0,       200
    addi.w        t0,       t0,       -1
    bnez          t0,       1f
    addi.w        a1,       zero,     0
1:
    addi.d        t0,       a3,       8
    vinsgr2vr.w   vr1,      t0,       0
    vinsgr2vr.w   vr1,      a5,       1
    vmin.w        vr1,      vr1,      vr16  // [0] col_end8i [1] row_end8
    addi.d        t0,       a2,       -8
    bge           t0,       zero,     2f
    addi.w        t0,       zero,     0     // t0 col_start8i
2:
    vpickve2gr.d  t4,       vr17,     1     // rf->rp_proj
    slli.d        t2,       t1,       2
    add.d         t2,       t2,       t1    // stride * 5
    slli.d        a1,       a1,       4     // tile_row_idx * 16
    andi          t3,       a4,       0xf
    add.d         t3,       t3,       a1    // tile_row_idx * 16 + (row_start8 & 15)
    mul.w         t3,       t3,       t2
    mul.w         t8,       a1,       t2
    vpickve2gr.w  a5,       vr1,      1
    addi.d        t5,       a4,       0
    sub.d         t6,       a3,       a2     // col_end8 - col_start8
    li.w          t7,       0x80008000
    slli.d        a7,       a2,       2
    add.d         t3,       t3,       a2
    add.d         t3,       t3,       a7
    add.d         t3,       t3,       t4     // rp_proj
    andi          a6,       t6,       1
    bnez          a6,       3f
    LOAD_SET_LOOP 0
    b             4f
3:
    LOAD_SET_LOOP 1
4:
    addi.d        a6,       zero,     0      // n
    bge           a6,       s8,       .end_load
    add.d         t3,       t8,       t4     // rp_proj
    mul.w         t6,       a4,       t2
    addi.d        s7,       zero,     40
    vpickve2gr.w  t1,       vr1,      0      // col_end8i
    vbsrl.v       vr2,      vr0,      4      // rf->mfmv_ref2cur
    addi.d        t5,       a0,       64     // rf->mfmv_ref2ref
    la.local      t8,       la_div_mult
    vld           vr6,      t8,       0
    vld           vr7,      t8,       16
    vld           vr8,      t8,       32
    vld           vr9,      t8,       48
    li.w          t8,       0x3fff
    vreplgr2vr.h  vr21,     t8
    vxor.v        vr18,     vr18,     vr18   // zero
    vsub.h        vr20,     vr18,     vr21
    vpickev.b     vr12,     vr7,      vr6
    vpickod.b     vr13,     vr7,      vr6
    vpickev.b     vr14,     vr9,      vr8
    vpickod.b     vr15,     vr9,      vr8
    vpickve2gr.d  s6,       vr17,     0       // rf->rp_ref
5:
    vld           vr10,     t5,       0
    vld           vr11,     t5,       16
    vpickev.h     vr10,     vr11,     vr10
    vpickev.b     vr10,     vr11,     vr10    // [1...7]

    vbsrl.v       vr0,      vr0,      1
    vpickve2gr.wu t8,       vr2,      0       // ref2cur
    vbsrl.v       vr2,      vr2,      4
    srli.d        t4,       t8,       24
    xori          t4,       t4,       0x80
    beqz          t4,       8f

    vreplgr2vr.h  vr23,     t8
    vshuf.b       vr6,      vr14,     vr12,    vr10
    vshuf.b       vr7,      vr15,     vr13,    vr10
    vilvl.b       vr8,      vr7,      vr6
    vmulwev.w.h   vr6,      vr8,      vr23
    vmulwod.w.h   vr7,      vr8,      vr23

    vpickve2gr.b  s0,       vr0,      0       // ref
    slli.d        t8,       s0,       3
    ldx.d         s1,       s6,       t8      // rf->rp_ref[ref]
    addi.d        s0,       s0,       -4      // ref_sign
    vreplgr2vr.h  vr19,     s0
    add.d         s1,       s1,       t6      // &rf->rp_ref[ref][row_start8 * stride]
    addi.d        s2,       a4,       0       // y
    vilvl.w       vr8,      vr7,      vr6
    vilvh.w       vr9,      vr7,      vr6
6:                                            // for (int y = row_start8;
    andi          s3,       s2,       0xff8

    addi.d        s4,       s3,       8
    blt           a4,       s3,       0f
    addi.d        s3,       a4,       0        // y_proj_start
0:
    blt           s4,       a5,       0f
    addi.d        s4,       a5,       0        // y_proj_end
0:
    addi.d        s5,       t0,       0        // x
7:                                             // for (int x = col_start8i;
    slli.d        a7,       s5,       2
    add.d         a7,       a7,       s5
    add.d         a7,       s1,       a7      // rb
    vld           vr3,      a7,       0       // [rb]
    vpickve2gr.b  t4,       vr3,      4       // b_ref
    beqz          t4,       .end_x
    vreplve.b     vr11,     vr10,     t4
    vpickve2gr.b  t7,       vr11,     4       // ref2ref
    beqz          t7,       .end_x
    vsllwil.w.h   vr4,      vr3,      0
    vreplgr2vr.w  vr6,      t4
    vshuf.w       vr6,      vr9,      vr8      // frac
    vmul.w        vr5,      vr6,      vr4
    vsrai.w       vr4,      vr5,      31
    vadd.w        vr4,      vr4,      vr5
    vssrarni.h.w  vr4,      vr4,      14
    vclip.h       vr4,      vr4,      vr20,    vr21  // offset
    vxor.v        vr5,      vr4,      vr19    // offset.x ^ ref_sign
    vori.b        vr5,      vr5,      0x1     // force nonzero so vsigncov keeps a +/-1 sign
    vabsd.h       vr4,      vr4,      vr18
    vsrli.h       vr4,      vr4,      6       // abs(offset.x) >> 6
    vsigncov.h    vr4,      vr5,      vr4     // apply_sign
    vpickve2gr.h  s0,       vr4,      0
    add.d         s0,       s2,       s0      // pos_y
    blt           s0,       s3,       .n_posy
    bge           s0,       s4,       .n_posy
    andi          s0,       s0,       0xf
    mul.w         s0,       s0,       t2      // pos
    vpickve2gr.h  t7,       vr4,      1
    add.d         t7,       t7,       s5      // pos_x
    add.d         s0,       t3,       s0      // rp_proj + pos

.loop_posx:
    andi          t4,       s5,       0xff8 // x_sb_align

    blt           t7,       a2,       .n_posx
    addi.d        t8,       t4,       -8
    blt           t7,       t8,       .n_posx

    bge           t7,       a3,       .n_posx
    addi.d        t4,       t4,       16
    bge           t7,       t4,       .n_posx

    slli.d        t4,       t7,       2
    add.d         t4,       t4,       t7      // pos_x * 5
    add.d         t4,       s0,       t4      // rp_proj[pos + pos_x]
    vstelm.w      vr3,      t4,       0,   0
    vstelm.b      vr11,     t4,       4,   4

.n_posx:
    addi.d        s5,       s5,       1       // x + 1
    bge           s5,       t1,       .ret_posx
    addi.d        a7,       a7,       5       // rb + 1
    vld           vr4,      a7,       0       // [rb]
    vseq.b        vr5,      vr4,      vr3

    vpickve2gr.d  t8,       vr5,      0
    cto.d         t8,       t8
    blt           t8,       s7,       7b

    addi.d        t7,       t7,       1       // pos_x + 1

    /*  Core computing loop expansion (second)  */
    andi          t4,       s5,       0xff8 // x_sb_align

    blt           t7,       a2,       .n_posx
    addi.d        t8,       t4,       -8
    blt           t7,       t8,       .n_posx

    bge           t7,       a3,       .n_posx
    addi.d        t4,       t4,       16
    bge           t7,       t4,       .n_posx

    slli.d        t4,       t7,       2
    add.d         t4,       t4,       t7      // pos_x * 5
    add.d         t4,       s0,       t4      // rp_proj[pos + pos_x]
    vstelm.w      vr3,      t4,       0,   0
    vstelm.b      vr11,     t4,       4,   4

    addi.d        s5,       s5,       1       // x + 1
    bge           s5,       t1,       .ret_posx
    addi.d        a7,       a7,       5       // rb + 1
    vld           vr4,      a7,       0       // [rb]
    vseq.b        vr5,      vr4,      vr3

    vpickve2gr.d  t8,       vr5,      0
    cto.d         t8,       t8
    blt           t8,       s7,       7b

    addi.d        t7,       t7,       1       // pos_x + 1

    /*  Core computing loop expansion (third)  */
    andi          t4,       s5,       0xff8 // x_sb_align

    blt           t7,       a2,       .n_posx
    addi.d        t8,       t4,       -8
    blt           t7,       t8,       .n_posx

    bge           t7,       a3,       .n_posx
    addi.d        t4,       t4,       16
    bge           t7,       t4,       .n_posx

    slli.d        t4,       t7,       2
    add.d         t4,       t4,       t7      // pos_x * 5
    add.d         t4,       s0,       t4      // rp_proj[pos + pos_x]
    vstelm.w      vr3,      t4,       0,   0
    vstelm.b      vr11,     t4,       4,   4

    addi.d        s5,       s5,       1       // x + 1
    bge           s5,       t1,       .ret_posx
    addi.d        a7,       a7,       5       // rb + 1
    vld           vr4,      a7,       0       // [rb]
    vseq.b        vr5,      vr4,      vr3

    vpickve2gr.d  t8,       vr5,      0
    cto.d         t8,       t8
    blt           t8,       s7,       7b

    addi.d        t7,       t7,       1       // pos_x + 1

    b             .loop_posx

.n_posy:
    addi.d        s5,       s5,       1       // x + 1
    bge           s5,       t1,       .ret_posx
    addi.d        a7,       a7,       5       // rb + 1
    vld           vr4,      a7,       0       // [rb]
    vseq.b        vr5,      vr4,      vr3

    vpickve2gr.d  t8,       vr5,      0
    cto.d         t8,       t8
    blt           t8,       s7,       7b

    addi.d        s5,       s5,       1       // x + 1
    bge           s5,       t1,       .ret_posx
    addi.d        a7,       a7,       5       // rb + 1
    vld           vr4,      a7,       0       // [rb]
    vseq.b        vr5,      vr4,      vr3

    vpickve2gr.d  t8,       vr5,      0
    cto.d         t8,       t8
    blt           t8,       s7,       7b

    b             .n_posy

.end_x:
    addi.d        s5,       s5,       1       // x + 1
    blt           s5,       t1,       7b

.ret_posx:
    add.d         s1,       s1,       t2      // r + stride
    addi.d        s2,       s2,       1       // y + 1
    blt           s2,       a5,       6b
8:
    addi.d        a6,       a6,       1       // n + 1
    addi.d        t5,       t5,       28      // mfmv_ref2ref(offset) + 28
    blt           a6,       s8,       5b

.end_load:
    ld.d           s0,      sp,       0
    ld.d           s1,      sp,       8
    ld.d           s2,      sp,       16
    ld.d           s3,      sp,       24
    ld.d           s4,      sp,       32
    ld.d           s5,      sp,       40
    ld.d           s6,      sp,       48
    ld.d           s7,      sp,       56
    ld.d           s8,      sp,       64
    addi.d         sp,      sp,       80
endfunc

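// Each 16-byte row of mv_tbls below is a vshuf.b control used by save_tmvs_lsx,
// indexed by the per-candidate case (mv[1] usable) * 2 + (mv[0] usable):
//   row 0: all 255         -> masked to zero afterwards, i.e. stored as .ref = 0 (invalid)
//   row 1: bytes 0-3, 8    -> take mv[0] and ref[0]
//   rows 2-3: bytes 4-7, 9 -> take mv[1] and ref[1] (mv[1] wins when both are usable)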
const mv_tbls
    .byte           255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
    .byte           0, 1, 2, 3, 8, 0, 1, 2, 3, 8, 0, 1, 2, 3, 8, 0
    .byte           4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4
    .byte           4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4
endconst

const mask_mult
    .byte           1, 0, 2, 0, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0
endconst

const mask_mv0
    .byte           1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
endconst

const mask_mv1
    .byte           4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19
endconst

// void dav1d_save_tmvs_lsx(refmvs_temporal_block *rp, ptrdiff_t stride,
//                          refmvs_block **rr, const uint8_t *ref_sign,
//                          int col_end8, int row_end8,
//                          int col_start8, int row_start8)
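//
// Rough scalar equivalent of the loop below (a paraphrase; width8_of() stands
// in for the block-size lookup whose result is pre-multiplied by
// sizeof(refmvs_block) == 12 in the first .hword of each .save_tevs_tbl pair):
//
//     for (int y = row_start8; y < row_end8; y++) {
//         const refmvs_block *const b = rr[(y & 15) * 2];
//         for (int x = col_start8; x < col_end8;) {
//             const refmvs_block *const cand_b = &b[x * 2 + 1];
//             const int bw8 = width8_of(cand_b->bs);
//             refmvs_temporal_block tb = { .ref = 0 };        // invalid by default
//             if (cand_b->ref.ref[1] > 0 && ref_sign[cand_b->ref.ref[1] - 1] &&
//                 (abs(cand_b->mv.mv[1].y) | abs(cand_b->mv.mv[1].x)) < 4096)
//                 tb = (refmvs_temporal_block) { .mv = cand_b->mv.mv[1],
//                                                .ref = cand_b->ref.ref[1] };
//             else if (cand_b->ref.ref[0] > 0 && ref_sign[cand_b->ref.ref[0] - 1] &&
//                      (abs(cand_b->mv.mv[0].y) | abs(cand_b->mv.mv[0].x)) < 4096)
//                 tb = (refmvs_temporal_block) { .mv = cand_b->mv.mv[0],
//                                                .ref = cand_b->ref.ref[0] };
//             for (int n = 0; n < bw8; n++, x++)
//                 rp[x] = tb;
//         }
//         rp += stride;
//     }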
function save_tmvs_lsx
    addi.d      sp,         sp,        -0x28
    st.d        s0,         sp,         0x00
    st.d        s1,         sp,         0x08
    st.d        s2,         sp,         0x10
    st.d        s3,         sp,         0x18
    st.d        s4,         sp,         0x20
    move        t0,         ra

    vxor.v      vr10,       vr10,       vr10
    vld         vr11,       a3,         0       // Load ref_sign[0] ~ ref_sign[7]
    la.local    t2,         .save_tevs_tbl
    la.local    s1,         mask_mult
    la.local    t7,         mv_tbls
    vld         vr9,        s1,         0       // Load mask_mult
    vslli.d     vr11,       vr11,       8       // 0, ref_sign[0], ... ,ref_sign[6]
    la.local    s3,         mask_mv0
    vld         vr8,        s3,         0       // Load mask_mv0
    la.local    s4,         mask_mv1
    vld         vr7,        s4,         0       // Load mask_mv1
    li.d        s0,         5
    li.d        t8,         12 * 2
    mul.d       a1,         a1,         s0      // stride *= 5
    sub.d       a5,         a5,         a7      // h = row_end8 - row_start8
    slli.d      a7,         a7,         1       // row_start8 <<= 1
1:
    li.d        s0,         5
    andi        t3,         a7,         30      // (y & 15) * 2
    slli.d      s4,         t3,         3
    ldx.d       t3,         a2,         s4      // b = rr[(y & 15) * 2]
    addi.d      t3,         t3,         12      // &b[... + 1]
    mul.d       s4,         a4,         t8
    add.d       t4,         s4,         t3      // end_cand_b = &b[col_end8*2 + 1]
    mul.d       s3,         a6,         t8
    add.d       t3,         s3,         t3      // cand_b = &b[x*2 + 1]
    mul.d       s4,         a6,         s0
    add.d       a3,         s4,         a0      // &rp[x]
2:
    /* First cand_b */
    ld.b        t5,         t3,         10      // cand_b->bs
    vld         vr0,        t3,         0       // cand_b->mv and ref
    alsl.d      t5,         t5,         t2,     2  // bt2 index
    ld.h        s3,         t3,         8       // cand_b->ref
    ld.h        t6,         t5,         0       // bt2
    move        s0,         t2
    alsl.d      t3,         t6,         t3,     1   // Next cand_b += bt2 * 2
    vor.v       vr2,        vr0,        vr0
    vinsgr2vr.h vr1,        s3,         0
    move        t1,         t3
    bge         t3,         t4,         3f

    /* Next cand_b */
    ld.b        s0,         t3,         10      // cand_b->bs
    vld         vr4,        t3,         0       // cand_b->mv and ref
    alsl.d      s0,         s0,         t2,     2 // bt2 index
    ld.h        s4,         t3,         8       // cand_b->ref
    ld.h        t6,         s0,         0       // bt2
    alsl.d      t3,         t6,         t3,     1   // Next cand_b += bt2*2
    vpackev.d   vr2,        vr4,        vr0     // a0.mv[0] a0.mv[1] a1.mv[0], a1.mv[1]
    vinsgr2vr.h vr1,        s4,         1   // a0.ref[0] a0.ref[1], a1.ref[0], a1.ref[1]
3:
    vabsd.h     vr2,        vr2,        vr10    // abs(mv[].xy)
    vsle.b      vr16,       vr10,       vr1
    vand.v      vr1,        vr16,       vr1
    vshuf.b     vr1,        vr11,       vr11,   vr1     // ref_sign[ref]
    vsrli.h     vr2,        vr2,        12      // abs(mv[].xy) >> 12
    vilvl.b     vr1,        vr1,        vr1
    vmulwev.h.bu    vr1,    vr1,        vr9     // ref_sign[ref] * {1, 2}

    vseqi.w     vr2,        vr2,        0       // abs(mv[].xy) < 4096
    vpickev.h   vr2,        vr2,        vr2     // abs() condition to 16 bit

    vand.v      vr1,        vr2,        vr1     // h[0-3] contain the conditions for mv[0-1]
    vhaddw.wu.hu    vr1,    vr1,        vr1     // Combine condition for [1] and [0]
    vpickve2gr.wu   s1,     vr1,        0       // Extract case for first block
    vpickve2gr.wu   s2,     vr1,        1

    ld.hu           t5,     t5,         2       // Fetch jump table entry
    ld.hu           s0,     s0,         2
    alsl.d          s3,     s1,         t7,    4   // Load permutation table based on case
    vld             vr1,    s3,         0
    alsl.d          s4,     s2,         t7,    4
    vld             vr5,    s4,         0
    sub.d           t5,     t2,         t5     // Find jump table target
    sub.d           s0,     t2,         s0

    vshuf.b         vr0,    vr0,        vr0,    vr1 // Permute cand_b to output refmvs_temporal_block
    vshuf.b         vr4,    vr4,        vr4,    vr5
    vsle.b          vr16,   vr10,       vr1
    vand.v          vr0,    vr16,       vr0

    vsle.b          vr17,   vr10,       vr5
    vand.v          vr4,    vr17,       vr4
    // vr1 follows on vr0, with another 3 full repetitions of the pattern.
    vshuf.b         vr1,    vr0,        vr0,    vr8 // 1, 2, 3, ... , 15, 16
    vshuf.b         vr5,    vr4,        vr4,    vr8 // 1, 2, 3, ... , 15, 16
    // vr2 ends with 3 complete repetitions of the pattern.
    vshuf.b         vr2,    vr1,        vr0,    vr7
    vshuf.b         vr6,    vr5,        vr4,    vr7    // 4, 5, 6, 7, ... , 12, 13, 14, 15, 16, 17, 18, 19

    jirl            ra,     t5,         0
    bge             t1,     t4,         4f      // if (cand_b >= end)
    vor.v           vr0,    vr4,        vr4
    vor.v           vr1,    vr5,        vr5
    vor.v           vr2,    vr6,        vr6
    jirl            ra,     s0,         0
    blt             t3,     t4,         2b      // if (cand_b < end)

4:
    addi.d          a5,     a5,         -1      // h--
    addi.d          a7,     a7,         2       // y += 2
    add.d           a0,     a0,         a1      // rp += stride
    blt             zero,   a5,         1b

    ld.d        s0,         sp,         0x00
    ld.d        s1,         sp,         0x08
    ld.d        s2,         sp,         0x10
    ld.d        s3,         sp,         0x18
    ld.d        s4,         sp,         0x20
    addi.d      sp,         sp,         0x28

    move            ra,     t0
    jirl            zero,   ra,         0x00

10:
    addi.d          s1,     a3,         4
    vstelm.w        vr0,    a3,         0,      0   // .mv
    vstelm.b        vr0,    s1,         0,      4   // .ref
    addi.d          a3,     a3,         5
    jirl            zero,   ra,         0x00
20:
    addi.d          s1,     a3,         8
    vstelm.d        vr0,    a3,         0,      0   // .mv
    vstelm.h        vr0,    s1,         0,      4   // .ref
    addi.d          a3,     a3,         2 * 5
    jirl            zero,   ra,         0x00
40:
    vst             vr0,    a3,         0
    vstelm.w        vr1,    a3,         0x10,   0
    addi.d          a3,     a3,         4 * 5
    jirl            zero,   ra,         0x00

80:
    vst             vr0,    a3,         0
    vst             vr1,    a3,         0x10           // This writes 6 full entries plus 2 extra bytes
    vst             vr2,    a3,         5 * 8 - 16     // Write the last few, overlapping with the first write.
    addi.d          a3,     a3,         8 * 5
    jirl            zero,   ra,         0x00
160:
    addi.d          s1,     a3,         6 * 5
    addi.d          s2,     a3,         12 * 5
    vst             vr0,    a3,         0
    vst             vr1,    a3,         0x10          // This writes 6 full entries plus 2 extra bytes
    vst             vr0,    a3,         6 * 5
    vst             vr1,    a3,         6 * 5 + 16    // Write another 6 full entries, slightly overlapping with the first set
    vstelm.d        vr0,    s2,         0,      0     // Write 8 bytes (one full entry) after the first 12
    vst             vr2,    a3,         5 * 16 - 16   // Write the last 3 entries
    addi.d          a3,     a3,         16 * 5
    jirl            zero,   ra,         0x00

.save_tevs_tbl:
        .hword 16 * 12   // bt2 * 12, 12 is sizeof(refmvs_block)
        .hword .save_tevs_tbl - 160b
        .hword 16 * 12
        .hword .save_tevs_tbl - 160b
        .hword 8 * 12
        .hword .save_tevs_tbl -  80b
        .hword 8 * 12
        .hword .save_tevs_tbl -  80b
        .hword 8 * 12
        .hword .save_tevs_tbl -  80b
        .hword 8 * 12
        .hword .save_tevs_tbl -  80b
        .hword 4 * 12
        .hword .save_tevs_tbl -  40b
        .hword 4 * 12
        .hword .save_tevs_tbl -  40b
        .hword 4 * 12
        .hword .save_tevs_tbl -  40b
        .hword 4 * 12
        .hword .save_tevs_tbl -  40b
        .hword 2 * 12
        .hword .save_tevs_tbl -  20b
        .hword 2 * 12
        .hword .save_tevs_tbl -  20b
        .hword 2 * 12
        .hword .save_tevs_tbl -  20b
        .hword 2 * 12
        .hword .save_tevs_tbl -  20b
        .hword 2 * 12
        .hword .save_tevs_tbl -  20b
        .hword 1 * 12
        .hword .save_tevs_tbl -  10b
        .hword 1 * 12
        .hword .save_tevs_tbl -  10b
        .hword 1 * 12
        .hword .save_tevs_tbl -  10b
        .hword 1 * 12
        .hword .save_tevs_tbl -  10b
        .hword 1 * 12
        .hword .save_tevs_tbl -  10b
        .hword 1 * 12
        .hword .save_tevs_tbl -  10b
        .hword 1 * 12
        .hword .save_tevs_tbl -  10b
endfunc