xref: /aosp_15_r20/external/libdav1d/src/loongarch/msac.S (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
1/*
2 * Copyright © 2023, VideoLAN and dav1d authors
3 * Copyright © 2023, Loongson Technology Corporation Limited
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright notice, this
10 *    list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright notice,
13 *    this list of conditions and the following disclaimer in the documentation
14 *    and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
20 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28#include "loongson_asm.S"
29
// min_prob[k] = 4 * (15 - k), as 16-bit values.
// decode_symbol_adapt loads vectors starting at byte offset 30 - 2 * a2 of
// this table, and msac_decode_hi_tok_lsx loads one vector at byte offset 24
// (first four lanes: 12, 8, 4, 0).
// NOTE(review): a 16-byte load from those offsets can run past the end of
// this 32-byte table; presumably this relies on the next constant being
// placed directly after it and on the extra lanes never being selected -
// confirm against the const/endconst definitions.
30const min_prob
31  .short 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0
32endconst
33
// Eight halfwords of 0xff00. msac_decode_hi_tok_lsx loads this into vr4 and
// ANDs it with the broadcast rng so that only the high byte of each 16-bit
// lane is kept.
34const ph_0xff00
35.rept 8
36  .short 0xff00
37.endr
38endconst
39
// decode_symbol_adapt w
//
// Shared body of the msac_decode_symbol_adapt{4,8,16}_lsx entry points.
// w is the number of 16-bit cdf lanes handled (4, 8 or 16); it selects how
// many vectors are processed and how the updated cdf is written back.
//
// In:   a0 = decoder context; fields are accessed at byte offsets
//            0 (buf_pos), 8 (buf_end), 16 (dif), 24 (rng), 28 (cnt) and
//            32 (allow_update_cdf)
//       a1 = cdf, an array of 16-bit values
//       a2 = index of the adaptation counter inside cdf (presumably the
//            n_symbols argument of the C prototype - confirm)
// Out:  a0 = index of the first lane whose v is <= the top 16 bits of dif
// Uses: t0-t8, a7, a subset of vr0-vr21, and 48 bytes of stack.
// The macro itself does not emit a return instruction.
40.macro decode_symbol_adapt w
41    addi.d          sp,      sp,     -48
42    vldrepl.h       vr0,     a0,      24   //rng
43    fst.s           f0,      sp,      0    //u for val==0: rng stored at sp+0
44    vld             vr1,     a1,      0    //cdf
45.if \w == 16
46    vld             vr11,    a1,      16
47.endif
48    vldrepl.d       vr2,     a0,      16   //dif
49    ld.w            t1,      a0,      32   //allow_update_cdf
    // t2 = min_prob + 30 - 2 * a2, so lane i of vr3 holds 4 * (a2 - i).
50    la.local        t2,      min_prob
51    addi.d          t2,      t2,      30
52    slli.w          t3,      a2,      1
53    sub.d           t2,      t2,      t3
54    vld             vr3,     t2,      0    //min_prob
55.if \w == 16
56    vld             vr13,    t2,      16
57.endif
    // v[i] = high 16 bits of (((rng >> 8) << 8) * ((cdf[i] >> 6) << 7))
    //        + min_prob lane i
58    vsrli.h         vr4,     vr0,     8    //r = s->rng >> 8
59    vslli.h         vr4,     vr4,     8    //r << 8
60    vsrli.h         vr5,     vr1,     6
61    vslli.h         vr5,     vr5,     7
62.if \w == 16
63    vsrli.h         vr15,    vr11,    6
64    vslli.h         vr15,    vr15,    7
65.endif
66    vmuh.hu         vr5,     vr4,     vr5
67    vadd.h          vr5,     vr5,     vr3  //v
68.if \w == 16
69    vmuh.hu         vr15,    vr4,     vr15
70    vadd.h          vr15,    vr15,    vr13
71.endif
    // Spill v[] to sp+2 so that the rng stored at sp+0 sits right below v[0];
    // the renorm code then reads u as the halfword preceding v[ret].
72    addi.d          t8,      sp,      2
73    vst             vr5,     t8,      0    //store v
74.if \w == 16
75    vst             vr15,    t8,      16
76.endif
    // Build a lane mask of v[i] <= c, where c is the top 16 bits of dif,
    // then compress it to one bit per lane in vr10.
77    vreplvei.h      vr20,    vr2,     3    //c
78    vsle.hu         vr6,     vr5,     vr20
79.if \w == 16
80    vsle.hu         vr16,    vr15,    vr20
81    vpickev.b       vr21,    vr16,    vr6
82.endif
83.if \w <= 8
84    vmskltz.h       vr10,    vr6
85.else
86    vmskltz.b       vr10,    vr21
87.endif
88    beqz            t1,      .renorm\()\w
89
90    // update_cdf
    // t1 = address of the counter cdf[a2];
    // rate = 4 + (count >> 4) + (a2 > 2), with the last term fixed to 1
    // in the 16-lane variant.
91    alsl.d          t1,      a2,      a1,   1
92    ld.h            t2,      t1,      0    //count
93    srli.w          t3,      t2,      4    //count >> 4
94.if \w == 16
95    addi.w          t3,      t3,      5    //rate
96.else
97    addi.w          t3,      t3,      4
98    li.w            t5,      2
99    sltu            t5,      t5,      a2   //a2 > 2
100    add.w           t3,      t3,      t5   //rate
101.endif
102    sltui           t5,      t2,      32
103    add.w           t2,      t2,      t5   //count + (count < 32)
104    vreplgr2vr.h    vr9,     t3
105    vseq.h          vr7,     vr7,     vr7  //all ones
    // Per lane, with m = mask lane (-1 where v <= c, else 0):
    //   cdf = (cdf - m) + (((m ? -1 : 32768) - cdf) >> rate)
106    vavgr.hu        vr5,     vr6,     vr7  //i >= val ? -1 : 32768
107    vsub.h          vr5,     vr5,     vr1
108    vsub.h          vr8,     vr1,     vr6
109.if \w == 16
110    vavgr.hu        vr15,    vr16,    vr7
111    vsub.h          vr15,    vr15,    vr11
112    vsub.h          vr18,    vr11,    vr16
113.endif
114    vsra.h          vr5,     vr5,     vr9
115    vadd.h          vr8,     vr8,     vr5
116.if \w == 4
117    fst.d           f8,      a1,      0    //write back 4 lanes only
118.else
119    vst             vr8,     a1,      0
120.endif
121.if \w == 16
122    vsra.h          vr15,    vr15,    vr9
123    vadd.h          vr18,    vr18,    vr15
124    vst             vr18,    a1,      16
125.endif
    // Stored last so that it overwrites whatever the vector store put in
    // the counter slot.
126    st.h            t2,      t1,      0
127
128.renorm\()\w:
    // ret = index of the lowest set mask bit; u and v are read back from
    // the stack spill (u is the halfword just below v[ret]).
129    vpickve2gr.h    t3,      vr10,    0
130    ctz.w           a7,      t3            // ret
131    alsl.d          t3,      a7,      t8,      1
132    ld.hu           t4,      t3,      0    // v
133    ld.hu           t5,      t3,      -2   // u
134    sub.w           t5,      t5,      t4   // rng
135    slli.d          t4,      t4,      48
136    vpickve2gr.d    t6,      vr2,     0
137    sub.d           t6,      t6,      t4   // dif
    // d = clz(rng) ^ 16; shift dif and rng left by d and subtract d from cnt.
138    clz.w           t4,      t5            // d
139    xori            t4,      t4,      16   // d
140    sll.d           t6,      t6,      t4
141    ld.w            t0,      a0,      28   //cnt
142    sll.w           t5,      t5,      t4
143    sub.w           t7,      t0,      t4   // cnt-d
144    st.w            t5,      a0,      24   // store rng
    // Unsigned compare: refill only when (unsigned)cnt < (unsigned)d.
145    bgeu            t0,      t4,      9f
146
147    // refill
148    ld.d            t0,      a0,      0    // buf_pos
149    ld.d            t1,      a0,      8    // buf_end
150    addi.d          t2,      t0,      8
151    bltu            t1,      t2,      2f   // fewer than 8 bytes left
152
    // At least 8 bytes available: load them, invert, byte-swap and shift
    // right so they line up below the bits already held in dif.
153    ld.d            t3,      t0,      0    // next_bits
154    addi.w          t1,      t7,      -48  // shift_bits = cnt + 16 (- 64)
155    nor             t3,      t3,      t3
156    sub.w           t2,      zero,    t1
157    revb.d          t3,      t3            // next_bits = bswap(next_bits)
158    srli.w          t2,      t2,      3    // num_bytes_read
159    srl.d           t3,      t3,      t1   // next_bits >>= (shift_bits & 63)
160    b               3f
    // 1: buf_pos >= buf_end. Nothing is read; a mask derived from cnt - 48
    // is ORed into dif and buf_pos is left unchanged.
1611:
162    addi.w          t3,      t7,      -48
163    srl.d           t3,      t3,      t3   // pad with ones
164    b               4f
    // 2: fewer than 8 bytes left. Load the 8 bytes ending at buf_end, shift
    // out the bytes that lie below buf_pos, then proceed as above with
    // num_bytes_read = min(num_bytes_left, (48 - cnt) >> 3).
1652:
166    bgeu            t0,      t1,      1b
167    ld.d            t3,      t1,      -8   // next_bits
168    sub.w           t2,      t2,      t1   // buf_pos + 8 - buf_end
169    sub.w           t1,      t1,      t0   // num_bytes_left
170    slli.w          t2,      t2,      3
171    srl.d           t3,      t3,      t2
172    addi.w          t2,      t7,      -48
173    nor             t3,      t3,      t3
174    sub.w           t4,      zero,    t2
175    revb.d          t3,      t3
176    srli.w          t4,      t4,      3
177    srl.d           t3,      t3,      t2
178    sltu            t2,      t1,      t4
179    maskeqz         t1,      t1,      t2
180    masknez         t2,      t4,      t2
181    or              t2,      t2,      t1   // num_bytes_read
    // 3: advance buf_pos and credit cnt with the bits just read.
1823:
183    slli.w          t1,      t2,      3
184    add.d           t0,      t0,      t2
185    add.w           t7,      t7,      t1   // cnt += num_bits_read
186    st.d            t0,      a0,      0
1874:
188    or              t6,      t6,      t3   // dif |= next_bits
    // 9: store the updated state and return value, release the stack.
1899:
190    st.w            t7,      a0,      28   // store cnt
191    st.d            t6,      a0,      16   // store dif
192    move            a0,      a7
193    addi.d          sp,      sp,      48
194.endm
195
// 4-lane instantiation of decode_symbol_adapt (a0 = context, a1 = cdf,
// a2 = counter index; result in a0). The updated cdf is written back with a
// 64-bit store.
// NOTE(review): the macro emits no return instruction; presumably endfunc
// provides it - confirm in loongson_asm.S.
196function msac_decode_symbol_adapt4_lsx
197    decode_symbol_adapt 4
198endfunc
199
// 8-lane instantiation of decode_symbol_adapt (a0 = context, a1 = cdf,
// a2 = counter index; result in a0). One full 128-bit vector of cdf values
// is read and, when adaptation is enabled, written back.
// NOTE(review): the macro emits no return instruction; presumably endfunc
// provides it - confirm in loongson_asm.S.
200function msac_decode_symbol_adapt8_lsx
201    decode_symbol_adapt 8
202endfunc
203
// 16-lane instantiation of decode_symbol_adapt (a0 = context, a1 = cdf,
// a2 = counter index; result in a0). Two 128-bit vectors of cdf values are
// processed and the adaptation rate uses the fixed 5 + (count >> 4) form.
// NOTE(review): the macro emits no return instruction; presumably endfunc
// provides it - confirm in loongson_asm.S.
204function msac_decode_symbol_adapt16_lsx
205    decode_symbol_adapt 16
206endfunc
207
// msac_decode_bool_lsx
//
// Decodes one boolean with an explicit probability.
// In:   a0 = decoder context; fields are accessed at byte offsets
//            0 (buf_pos), 8 (buf_end), 16 (dif), 24 (rng), 28 (cnt)
//       a1 = probability f
// Out:  a0 = 1 if dif < (v << 48), otherwise 0, where
//            v = (((rng >> 8) * (f >> 6)) >> 1) + 4
// Uses: t0-t8, a1, a5. No stack is used.
// NOTE(review): no explicit return instruction is visible; presumably
// endfunc emits it - confirm in loongson_asm.S.
208function msac_decode_bool_lsx
209    ld.w            t0,      a0,      24   // rng
210    srli.w          a1,      a1,      6
211    ld.d            t1,      a0,      16   // dif
212    srli.w          t2,      t0,      8    // r >> 8
213    mul.w           t2,      t2,      a1
214    ld.w            a5,      a0,      28   // cnt
215    srli.w          t2,      t2,      1
216    addi.w          t2,      t2,      4    // v
217    slli.d          t3,      t2,      48   // vw
218    sltu            t4,      t1,      t3   // dif < vw
219    move            t8,      t4            // ret
220    xori            t4,      t4,      1    // t4 = (dif >= vw)
    // Branch-free select: when dif >= vw, dif -= vw and rng = r - v;
    // otherwise dif is unchanged and rng = v.
221    maskeqz         t6,      t3,      t4   // if (ret) vw
222    sub.d           t6,      t1,      t6   // dif
223    slli.w          t5,      t2,      1
224    sub.w           t5,      t0,      t5   // r - 2v
225    maskeqz         t7,      t5,      t4   // if (ret) r - 2v
226    add.w           t5,      t2,      t7   // v(rng)
227
228    // renorm
    // d = clz(rng) ^ 16; shift dif and rng left by d and subtract d from cnt.
229    clz.w           t4,      t5            // d
230    xori            t4,      t4,      16   // d
231    sll.d           t6,      t6,      t4
232    sll.w           t5,      t5,      t4
233    sub.w           t7,      a5,      t4   // cnt-d
234    st.w            t5,      a0,      24   // store rng
    // Unsigned compare: refill only when (unsigned)cnt < (unsigned)d.
235    bgeu            a5,      t4,      9f
236
237    // refill
238    ld.d            t0,      a0,      0    // buf_pos
239    ld.d            t1,      a0,      8    // buf_end
240    addi.d          t2,      t0,      8
241    bltu            t1,      t2,      2f   // fewer than 8 bytes left
242
    // At least 8 bytes available: load them, invert, byte-swap and shift
    // right so they line up below the bits already held in dif.
243    ld.d            t3,      t0,      0    // next_bits
244    addi.w          t1,      t7,      -48  // shift_bits = cnt + 16 (- 64)
245    nor             t3,      t3,      t3
246    sub.w           t2,      zero,    t1
247    revb.d          t3,      t3            // next_bits = bswap(next_bits)
248    srli.w          t2,      t2,      3    // num_bytes_read
249    srl.d           t3,      t3,      t1   // next_bits >>= (shift_bits & 63)
250    b               3f
    // 1: buf_pos >= buf_end. Nothing is read; a mask derived from cnt - 48
    // is ORed into dif and buf_pos is left unchanged.
2511:
252    addi.w          t3,      t7,      -48
253    srl.d           t3,      t3,      t3   // pad with ones
254    b               4f
    // 2: fewer than 8 bytes left. Load the 8 bytes ending at buf_end, shift
    // out the bytes that lie below buf_pos, then proceed as above with
    // num_bytes_read = min(num_bytes_left, (48 - cnt) >> 3).
2552:
256    bgeu            t0,      t1,      1b
257    ld.d            t3,      t1,      -8   // next_bits
258    sub.w           t2,      t2,      t1   // buf_pos + 8 - buf_end
259    sub.w           t1,      t1,      t0   // num_bytes_left
260    slli.w          t2,      t2,      3
261    srl.d           t3,      t3,      t2
262    addi.w          t2,      t7,      -48
263    nor             t3,      t3,      t3
264    sub.w           t4,      zero,    t2
265    revb.d          t3,      t3
266    srli.w          t4,      t4,      3
267    srl.d           t3,      t3,      t2
268    sltu            t2,      t1,      t4
269    maskeqz         t1,      t1,      t2
270    masknez         t2,      t4,      t2
271    or              t2,      t2,      t1   // num_bytes_read
    // 3: advance buf_pos and credit cnt with the bits just read.
2723:
273    slli.w          t1,      t2,      3
274    add.d           t0,      t0,      t2
275    add.w           t7,      t7,      t1   // cnt += num_bits_read
276    st.d            t0,      a0,      0
2774:
278    or              t6,      t6,      t3   // dif |= next_bits
    // 9: store the updated state and set the return value.
2799:
280    st.w            t7,      a0,      28   // store cnt
281    st.d            t6,      a0,      16   // store dif
282    move            a0,      t8
283endfunc
284
// msac_decode_bool_equi_lsx
//
// Decodes one boolean without a probability argument: the split point is
// derived from rng alone.
// In:   a0 = decoder context; fields are accessed at byte offsets
//            0 (buf_pos), 8 (buf_end), 16 (dif), 24 (rng), 28 (cnt)
// Out:  a0 = 1 if dif < (v << 48), otherwise 0, where
//            v = ((rng >> 8) << 7) + 4
// Uses: t0-t8, a5. No stack is used.
// NOTE(review): no explicit return instruction is visible; presumably
// endfunc emits it - confirm in loongson_asm.S.
285function msac_decode_bool_equi_lsx
286    ld.w            t0,      a0,      24   // rng
287    ld.d            t1,      a0,      16   // dif
288    ld.w            a5,      a0,      28   // cnt
289    srli.w          t2,      t0,      8    // r >> 8
290    slli.w          t2,      t2,      7
291    addi.w          t2,      t2,      4    // v
292
293    slli.d          t3,      t2,      48   // vw
294    sltu            t4,      t1,      t3   // dif < vw
295    move            t8,      t4            // ret
296    xori            t4,      t4,      1    // t4 = (dif >= vw)
    // Branch-free select: when dif >= vw, dif -= vw and rng = r - v;
    // otherwise dif is unchanged and rng = v.
297    maskeqz         t6,      t3,      t4   // if (ret) vw
298    sub.d           t6,      t1,      t6   // dif
299    slli.w          t5,      t2,      1
300    sub.w           t5,      t0,      t5   // r - 2v
301    maskeqz         t7,      t5,      t4   // if (ret) r - 2v
302    add.w           t5,      t2,      t7   // v(rng)
303
304    // renorm
    // d = clz(rng) ^ 16; shift dif and rng left by d and subtract d from cnt.
305    clz.w           t4,      t5            // d
306    xori            t4,      t4,      16   // d
307    sll.d           t6,      t6,      t4
308    sll.w           t5,      t5,      t4
309    sub.w           t7,      a5,      t4   // cnt-d
310    st.w            t5,      a0,      24   // store rng
    // Unsigned compare: refill only when (unsigned)cnt < (unsigned)d.
311    bgeu            a5,      t4,      9f
312
313    // refill
314    ld.d            t0,      a0,      0    // buf_pos
315    ld.d            t1,      a0,      8    // buf_end
316    addi.d          t2,      t0,      8
317    bltu            t1,      t2,      2f   // fewer than 8 bytes left
318
    // At least 8 bytes available: load them, invert, byte-swap and shift
    // right so they line up below the bits already held in dif.
319    ld.d            t3,      t0,      0    // next_bits
320    addi.w          t1,      t7,      -48  // shift_bits = cnt + 16 (- 64)
321    nor             t3,      t3,      t3
322    sub.w           t2,      zero,    t1
323    revb.d          t3,      t3            // next_bits = bswap(next_bits)
324    srli.w          t2,      t2,      3    // num_bytes_read
325    srl.d           t3,      t3,      t1   // next_bits >>= (shift_bits & 63)
326    b               3f
    // 1: buf_pos >= buf_end. Nothing is read; a mask derived from cnt - 48
    // is ORed into dif and buf_pos is left unchanged.
3271:
328    addi.w          t3,      t7,      -48
329    srl.d           t3,      t3,      t3   // pad with ones
330    b               4f
    // 2: fewer than 8 bytes left. Load the 8 bytes ending at buf_end, shift
    // out the bytes that lie below buf_pos, then proceed as above with
    // num_bytes_read = min(num_bytes_left, (48 - cnt) >> 3).
3312:
332    bgeu            t0,      t1,      1b
333    ld.d            t3,      t1,      -8   // next_bits
334    sub.w           t2,      t2,      t1   // buf_pos + 8 - buf_end
335    sub.w           t1,      t1,      t0   // num_bytes_left
336    slli.w          t2,      t2,      3
337    srl.d           t3,      t3,      t2
338    addi.w          t2,      t7,      -48
339    nor             t3,      t3,      t3
340    sub.w           t4,      zero,    t2
341    revb.d          t3,      t3
342    srli.w          t4,      t4,      3
343    srl.d           t3,      t3,      t2
344    sltu            t2,      t1,      t4
345    maskeqz         t1,      t1,      t2
346    masknez         t2,      t4,      t2
347    or              t2,      t2,      t1   // num_bytes_read
    // 3: advance buf_pos and credit cnt with the bits just read.
3483:
349    slli.w          t1,      t2,      3
350    add.d           t0,      t0,      t2
351    add.w           t7,      t7,      t1   // cnt += num_bits_read
352    st.d            t0,      a0,      0
3534:
354    or              t6,      t6,      t3   // dif |= next_bits
    // 9: store the updated state and set the return value.
3559:
356    st.w            t7,      a0,      28   // store cnt
357    st.d            t6,      a0,      16   // store dif
358    move            a0,      t8
359endfunc
360
// msac_decode_bool_adapt_lsx
//
// Decodes one boolean whose probability is read from a two-entry cdf and,
// when the context field at offset 32 is non-zero, adapts that cdf.
// In:   a0 = decoder context; fields are accessed at byte offsets
//            0 (buf_pos), 8 (buf_end), 16 (dif), 24 (rng), 28 (cnt) and
//            32 (allow_update_cdf)
//       a1 = cdf: cdf[0] is the probability, cdf[1] the adaptation counter
// Out:  a0 = bit = 1 if dif < (v << 48), otherwise 0, where
//            v = (((rng >> 8) * (cdf[0] >> 6)) >> 1) + 4
// Uses: t0-t8, a3-a5, a7. No stack is used.
// NOTE(review): no explicit return instruction is visible; presumably
// endfunc emits it - confirm in loongson_asm.S.
361function msac_decode_bool_adapt_lsx
362    ld.hu           a3,      a1,      0    // cdf[0] /f
363    ld.w            t0,      a0,      24   // rng
364    ld.d            t1,      a0,      16   // dif
365    srli.w          t2,      t0,      8    // r >> 8
366    srli.w          a7,      a3,      6
367    mul.w           t2,      t2,      a7
368    ld.w            a4,      a0,      32   // allow_update_cdf
369    ld.w            a5,      a0,      28   // cnt
370    srli.w          t2,      t2,      1
371    addi.w          t2,      t2,      4    // v
372    slli.d          t3,      t2,      48   // vw
373    sltu            t4,      t1,      t3   // dif < vw
374    move            t8,      t4            // bit
375    xori            t4,      t4,      1    // t4 = (dif >= vw)
    // Branch-free select: when dif >= vw, dif -= vw and rng = r - v;
    // otherwise dif is unchanged and rng = v.
376    maskeqz         t6,      t3,      t4   // if (ret) vw
377    sub.d           t6,      t1,      t6   // dif
378    slli.w          t5,      t2,      1
379    sub.w           t5,      t0,      t5   // r - 2v
380    maskeqz         t7,      t5,      t4   // if (ret) r - 2v
381    add.w           t5,      t2,      t7   // v(rng)
382    beqz            a4,      .renorm
383
384    // update_cdf
    // rate = 4 + (count >> 4). The single expression below yields
    //   bit == 1: cdf[0] + ((32768 - cdf[0]) >> rate)
    //   bit == 0: cdf[0] - (cdf[0] >> rate)
    // t0-t2, t4 and t7 are free to reuse here; t5, t6 and a5 stay live.
385    ld.hu           t0,      a1,      2    // cdf[1]
386    srli.w          t1,      t0,      4
387    addi.w          t1,      t1,      4    // rate
388    sltui           t2,      t0,      32   // count < 32
389    add.w           t0,      t0,      t2   // count + (count < 32)
390    sub.w           a3,      a3,      t8   // cdf[0] -= bit
391    slli.w          t4,      t8,      15
392    sub.w           t7,      a3,      t4   // cdf[0] - bit - 32768
393    sra.w           t7,      t7,      t1   // (cdf[0] - bit - 32768) >> rate
394    sub.w           t7,      a3,      t7   // cdf[0]
395    st.h            t7,      a1,      0
396    st.h            t0,      a1,      2
397
398.renorm:
    // d = clz(rng) ^ 16; shift dif and rng left by d and subtract d from cnt.
399    clz.w           t4,      t5            // d
400    xori            t4,      t4,      16   // d
401    sll.d           t6,      t6,      t4
402    sll.w           t5,      t5,      t4
403    sub.w           t7,      a5,      t4   // cnt-d
404    st.w            t5,      a0,      24   // store rng
    // Unsigned compare: refill only when (unsigned)cnt < (unsigned)d.
405    bgeu            a5,      t4,      9f
406
407    // refill
408    ld.d            t0,      a0,      0    // buf_pos
409    ld.d            t1,      a0,      8    // buf_end
410    addi.d          t2,      t0,      8
411    bltu            t1,      t2,      2f   // fewer than 8 bytes left
412
    // At least 8 bytes available: load them, invert, byte-swap and shift
    // right so they line up below the bits already held in dif.
413    ld.d            t3,      t0,      0    // next_bits
414    addi.w          t1,      t7,      -48  // shift_bits = cnt + 16 (- 64)
415    nor             t3,      t3,      t3
416    sub.w           t2,      zero,    t1
417    revb.d          t3,      t3            // next_bits = bswap(next_bits)
418    srli.w          t2,      t2,      3    // num_bytes_read
419    srl.d           t3,      t3,      t1   // next_bits >>= (shift_bits & 63)
420    b               3f
    // 1: buf_pos >= buf_end. Nothing is read; a mask derived from cnt - 48
    // is ORed into dif and buf_pos is left unchanged.
4211:
422    addi.w          t3,      t7,      -48
423    srl.d           t3,      t3,      t3   // pad with ones
424    b               4f
    // 2: fewer than 8 bytes left. Load the 8 bytes ending at buf_end, shift
    // out the bytes that lie below buf_pos, then proceed as above with
    // num_bytes_read = min(num_bytes_left, (48 - cnt) >> 3).
4252:
426    bgeu            t0,      t1,      1b
427    ld.d            t3,      t1,      -8   // next_bits
428    sub.w           t2,      t2,      t1   // buf_pos + 8 - buf_end
429    sub.w           t1,      t1,      t0   // num_bytes_left
430    slli.w          t2,      t2,      3
431    srl.d           t3,      t3,      t2
432    addi.w          t2,      t7,      -48
433    nor             t3,      t3,      t3
434    sub.w           t4,      zero,    t2
435    revb.d          t3,      t3
436    srli.w          t4,      t4,      3
437    srl.d           t3,      t3,      t2
438    sltu            t2,      t1,      t4
439    maskeqz         t1,      t1,      t2
440    masknez         t2,      t4,      t2
441    or              t2,      t2,      t1   // num_bytes_read
    // 3: advance buf_pos and credit cnt with the bits just read.
4423:
443    slli.w          t1,      t2,      3
444    add.d           t0,      t0,      t2
445    add.w           t7,      t7,      t1   // cnt += num_bits_read
446    st.d            t0,      a0,      0
4474:
448    or              t6,      t6,      t3   // dif |= next_bits
    // 9: store the updated state and set the return value.
4499:
450    st.w            t7,      a0,      28   // store cnt
451    st.d            t6,      a0,      16   // store dif
452    move            a0,      t8
453endfunc
454
/*
 * Stack scratch used by msac_decode_hi_tok_lsx / HI_TOK:
 *   sp + 0  : rng (read back as u when the decoded symbol is 0)
 *   sp + 2  : one 16-byte vector of v values
 * 18 bytes are needed; the frame is rounded up to 32 so that sp stays
 * 16-byte aligned while the function runs.
 */
#define HI_TOK_STACK_SIZE 0x20

/*
 * HI_TOK allow_update_cdf
 *
 * Loop body of msac_decode_hi_tok_lsx. Each round decodes one 4-lane symbol
 * tok_br in 0..3, optionally adapts the cdf, renormalises and refills the
 * bit window, and repeats while tok_br == 3, for at most four rounds.
 * allow_update_cdf (0 or 1) selects at assembly time whether the cdf update
 * code is emitted; it is also used as a prefix for the local labels so the
 * macro can be instantiated once per value.
 *
 * Expects the register state set up by msac_decode_hi_tok_lsx (see below)
 * and HI_TOK_STACK_SIZE bytes already reserved at sp. On fall-through the
 * stack is released, dif/cnt are stored back and a0 holds the token; the
 * macro does not emit a return instruction itself.
 *
 * Loop control: t3 starts at -24 and grows by 2 * tok_br per round, so it
 * reaches 0 after four rounds of tok_br == 3; the result is (t3 + 30) >> 1,
 * i.e. 3 plus the sum of all tok_br values.
 */
.macro HI_TOK allow_update_cdf
.\allow_update_cdf\()_hi_tok_lsx_start:
.if \allow_update_cdf == 1
    ld.hu        a4,    a1,    0x06 // count = cdf[3]
.endif
    vsrli.h      vr1,   vr0,   0x06 // cdf[i] >> EC_PROB_SHIFT
    vstelm.h     vr2,   sp,    0, 0 // sp[0] = rng, must precede the mask below
    vand.v       vr2,   vr2,   vr4  // (8 x rng) & 0xff00
    vslli.h      vr1,   vr1,   0x07
    vmuh.hu      vr1,   vr1,   vr2
    vadd.h       vr1,   vr1,   vr5  // v[i] += EC_MIN_PROB (4) * (n_symbols (3) - i)
    vst          vr1,   sp,    0x02 // sp[2 + 2 * i] = v[i]
    vssub.hu     vr1,   vr1,   vr3  // saturating v - c: zero iff v <= c
    vseqi.h      vr1,   vr1,   0    // lane mask: -1 where v[i] <= c
.if \allow_update_cdf == 1
    addi.d       t4,    a4,    0x50
    srli.d       t4,    t4,    0x04 // rate = 5 + (count >> 4)
    sltui        t7,    a4,    32
    add.w        a4,    a4,    t7   // count + (count < 32)

    vreplgr2vr.h vr7,   t4
    vavgr.hu     vr9,   vr8,   vr1  // i >= val ? -1 : 32768
    vsub.h       vr9,   vr9,   vr0
    vsub.h       vr0,   vr0,   vr1
    vsra.h       vr9,   vr9,   vr7
    vadd.h       vr0,   vr0,   vr9  // updated cdf, kept in vr0 for the next round
    vstelm.d     vr0,   a1,    0,  0 // write back cdf[0..3]
    st.h         a4,    a1,    0x06  // then overwrite cdf[3] with the new count
.endif
    vmsknz.b     vr7,   vr1         // one bit per non-zero byte of the mask
    movfr2gr.s   t4,    f7
    ctz.w        t4,    t4          // 2 * tok_br = byte offset of u on the stack
    addi.d       t7,    t4,    2
    ldx.hu       t6,    sp,    t4   // u
    ldx.hu       t5,    sp,    t7   // v
    addi.w       t3,    t3,    0x05
    addi.w       t4,    t4,   -0x05 // 2 * tok_br - 5: positive only when tok_br == 3
    sub.w        t6,    t6,    t5   // rng = u - v
    slli.d       t5,    t5,    0x30 // (ec_win)v << (EC_WIN_SIZE - 16)
    sub.d        t1,    t1,    t5   // dif -= (ec_win)v << (EC_WIN_SIZE - 16)
    // ctx_norm
    clz.w        t7,    t6
    xori         t7,    t7,    0x10 // d = 15 ^ (31 ^ clz(rng))
    sll.d        t1,    t1,    t7   // dif << d
    sll.d        t6,    t6,    t7   // rng << d
    vreplgr2vr.h vr2,   t6          // 8 x rng for the next round
    st.w         t6,    a0,    0x18 // store rng
    move         t0,    t2
    sub.w        t2,    t2,    t7   // cnt - d
    // Refill only when (unsigned)cnt < (unsigned)d; otherwise skip it.
    bgeu         t0,    t7,    .\allow_update_cdf\()_hi_tok_lsx_ctx_norm_end
    // ctx_refill
    ld.d         t5,    a0,    0x00 // buf_pos
    ld.d         t6,    a0,    0x08 // end_pos
    addi.d       t7,    t5,    0x08 // buf_pos + 8
    sub.d        t7,    t7,    t6   // (buf_pos + 8) - end_pos
    blt          zero,  t7,    .\allow_update_cdf\()_hi_tok_lsx_ctx_refill_eob
    // (end_pos - buf_pos) >= 8
    ld.d         t6,    t5,    0x00 // load buf_pos[0] .. buf_pos[7]
    addi.w       t7,    t2,   -0x30 // cnt - 48
    nor          t6,    t6,    t6   // invert the buffer data
    revb.d       t6,    t6          // byte reversal
    srl.d        t6,    t6,    t7   // shift right by (cnt + 16) & 63
    sub.w        t7,    zero,  t7   // 48 - cnt
    srli.w       t7,    t7,    0x03 // number of bytes consumed
    or           t1,    t1,    t6   // dif |= next bits
    b            .\allow_update_cdf\()_hi_tok_lsx_ctx_refill_end
.\allow_update_cdf\()_hi_tok_lsx_ctx_refill_eob:
    // Unsigned compare, matching the pointer checks in the other decoders.
    bgeu         t5,    t6,    .\allow_update_cdf\()_hi_tok_lsx_ctx_refill_one
    // end_pos - buf_pos < 8 && buf_pos < end_pos
    ld.d         t0,    t6,   -0x08 // the 8 bytes ending at end_pos
    slli.d       t7,    t7,    0x03
    srl.d        t6,    t0,    t7   // drop the bytes that lie below buf_pos
    addi.w       t7,    t2,   -0x30 // cnt - 48
    nor          t6,    t6,    t6   // invert; the cleared top bytes become ones
    revb.d       t6,    t6          // byte reversal
    srl.d        t6,    t6,    t7   // shift right by (cnt + 16) & 63
    sub.w        t7,    zero,  t7   // 48 - cnt
    or           t1,    t1,    t6   // dif |= next bits
    ld.d         t6,    a0,    0x08 // end_pos
    srli.w       t7,    t7,    0x03 // number of bytes wanted
    sub.d        t6,    t6,    t5   // end_pos - buf_pos
    slt          t0,    t6,    t7
    maskeqz      a3,    t6,    t0   // min(bytes wanted, end_pos - buf_pos)
    masknez      t0,    t7,    t0
    or           t7,    a3,    t0
    b            .\allow_update_cdf\()_hi_tok_lsx_ctx_refill_end
.\allow_update_cdf\()_hi_tok_lsx_ctx_refill_one:
    // buf_pos >= end_pos: set the low (48 - cnt) bits of dif, read nothing
    addi.w       t7,    t2,   -0x10
    andi         t7,    t7,    0xf
    nor          t0,    zero,  zero
    srl.d        t0,    t0,    t7
    or           t1,    t1,    t0   // dif |= ~(~(ec_win)0xff << c)
    b            .\allow_update_cdf\()_hi_tok_lsx_ctx_norm_end
.\allow_update_cdf\()_hi_tok_lsx_ctx_refill_end:
    add.d        t5,    t5,    t7        // buf_pos + bytes consumed
    st.d         t5,    a0,    0x00      // store buf_pos
    alsl.w       t2,    t7,    t2,  0x03 // cnt += 8 * bytes consumed
.\allow_update_cdf\()_hi_tok_lsx_ctx_norm_end:
    srli.d       t7,    t1,    0x30
    vreplgr2vr.h vr3,   t7          // broadcast the high 16 bits of dif
    add.w        t3,    t4,    t3   // t3 += 2 * tok_br
    beqz         t3,    .\allow_update_cdf\()_hi_tok_lsx_end   // four rounds done
    blt          zero,  t4,    .\allow_update_cdf\()_hi_tok_lsx_start // tok_br == 3
.\allow_update_cdf\()_hi_tok_lsx_end:
    addi.d       t3,    t3,    0x1e
    st.d         t1,    a0,    0x10 // store dif
    st.w         t2,    a0,    0x1c // store cnt
    srli.w       a0,    t3,    0x01 // tok
    addi.d       sp,    sp,    HI_TOK_STACK_SIZE
.endm

/**
 * unsigned dav1d_msac_decode_hi_tok_c(MsacContext *const s, uint16_t *const cdf)
 *
 * In:  a0 = s, a1 = cdf (cdf[0..2] probabilities, cdf[3] adaptation counter)
 * Out: a0 = tok
 *
 * Register allocation
 *   vr0: cdf
 *   vr1: temp
 *   vr2: 8 x rng
 *   vr3: 8 x (high 16 bits of dif)
 *   vr4: const 0xff00 in every halfword
 *   vr5: min_prob lanes 12, 8, 4, 0 in the first four halfwords
 *   vr7, vr9: temp
 *   vr8: const all ones
 *   t0: allow_update_cdf, then temp
 *   t1: dif
 *   t2: cnt
 *   t3: loop control, starts at -24 (see HI_TOK)
 *   t4: 2 * tok_br, then 2 * tok_br - 5
 *   t5: v, buf_pos, temp
 *   t6: u, rng, end_pos, buffer data, temp
 *   t7: temp
 *   a3, a4: temp (a4 holds the adaptation counter)
 */
function msac_decode_hi_tok_lsx
    fld.d     f0,    a1,   0      // load cdf[0] .. cdf[3]
    vldrepl.h vr2,   a0,   0x18   // 8 x rng; only the low 16 bits of rng are used
    vldrepl.h vr3,   a0,   0x16   // c = s->dif >> (EC_WIN_SIZE - 16), broadcast
    ld.w      t0,    a0,   0x20   // allow_update_cdf
    la.local  t7,    ph_0xff00
    vld       vr4,   t7,   0x00   // 0xff00 in every halfword
    la.local  t7,    min_prob
    vld       vr5,   t7,   12 * 2 // lanes 0..3 = 12, 8, 4, 0
    ld.d      t1,    a0,   0x10   // dif
    ld.w      t2,    a0,   0x1c   // cnt
    li.w      t3,    -24          // loop control; only its low 32 bits are consumed
    vseq.h    vr8,   vr8,  vr8    // all ones
    addi.d    sp,    sp,  -HI_TOK_STACK_SIZE // released at the end of HI_TOK
    beqz      t0,    .hi_tok_lsx_no_update_cdf
    HI_TOK 1
    jirl      zero,  ra,   0x0
.hi_tok_lsx_no_update_cdf:
    HI_TOK 0
endfunc
612endfunc
613