xref: /aosp_15_r20/external/libdav1d/src/arm/64/msac.S (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
1/*
2 * Copyright © 2019, VideoLAN and dav1d authors
3 * Copyright © 2019, Martin Storsjo
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright notice, this
10 *    list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright notice,
13 *    this list of conditions and the following disclaimer in the documentation
14 *    and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
20 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28#include "src/arm/asm.S"
29#include "util.S"
30
// Byte offsets of the context fields that the functions in this file access
// through the pointer in x0. BUF_POS/BUF_END are read as a 64-bit pair, DIF
// as 64 bits, RNG/CNT as adjacent 32-bit words and ALLOW_UPDATE_CDF as a
// 32-bit word.
31#define BUF_POS 0
32#define BUF_END 8
33#define DIF 16
34#define RNG 24
35#define CNT 28
36#define ALLOW_UPDATE_CDF 32
37
// Byte offset of the last entry (value 0) of the first row of the coeffs
// table. The symbol decoders subtract 2*n_symbols from coeffs + this offset
// before loading their per-lane constants.
38#define COEFFS_BASE_OFFSET 30
// The masks8 row starts at byte 64 of coeffs; this is its offset relative to
// coeffs + COEFFS_BASE_OFFSET, which is the base register the decoders keep.
39#define MASKS8_OFFSET (64-COEFFS_BASE_OFFSET)
40
// Constant table shared by the symbol decoders.
//
//  Bytes  0-31: 4 * (15 - i) for i = 0..15.
//  Bytes 32-63: zeros.
//    A vector load from coeffs + COEFFS_BASE_OFFSET - 2*n_symbols therefore
//    yields 4 * (n_symbols - lane) in lanes 0..n_symbols and 0 in all higher
//    lanes; the zero row keeps wide loads inside the table.
//  Bytes 64-79: masks8, used only by the 8-lane decoder. ANDed with the
//    "c >= v" compare mask and summed across lanes, it produces the halfword
//    ((2*ret + 1) << 8) | (2*ret), i.e. the two tbl byte indices of lane ret.
41const coeffs
42        .short 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0
43        .short 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 0, 0
44        // masks8
45        .short -0x202, -0x202, -0x202, -0x202, -0x202, -0x202, -0x202, 0xF0E
46endconst
47
// ld1_n d0, d1, src, sz, n
// Load vector data from [\src] with arrangement \sz: one register (\d0) when
// \n <= 8, otherwise the register pair {\d0, \d1} from consecutive memory.
// \d1 is ignored in the single-register case. \src is not written back.
48.macro ld1_n d0, d1, src, sz, n
49.if \n <= 8
50        ld1             {\d0\sz},  [\src]
51.else
52        ld1             {\d0\sz, \d1\sz},  [\src]
53.endif
54.endm
55
// st1_n s0, s1, dst, sz, n
// Store counterpart of ld1_n: writes \s0 to [\dst] when \n <= 8, otherwise
// the register pair {\s0, \s1} to consecutive memory. \dst is not written
// back.
56.macro st1_n s0, s1, dst, sz, n
57.if \n <= 8
58        st1             {\s0\sz},  [\dst]
59.else
60        st1             {\s0\sz, \s1\sz},  [\dst]
61.endif
62.endm
63
// ushr_n d0, d1, s0, s1, shift, sz, n
// \d0 = \s0 >> \shift (unsigned, per lane); when \n == 16 the same shift is
// also applied to the second register: \d1 = \s1 >> \shift.
// No use of this macro is visible in this file.
64.macro ushr_n d0, d1, s0, s1, shift, sz, n
65        ushr            \d0\sz,  \s0\sz,  \shift
66.if \n == 16
67        ushr            \d1\sz,  \s1\sz,  \shift
68.endif
69.endm
70
// add_n d0, d1, s0, s1, s2, s3, sz, n
// \d0 = \s0 + \s2; when \n == 16 also \d1 = \s1 + \s3.
// The first-half instruction is always emitted before the second-half one,
// which matters to callers that alias destination and source registers.
71.macro add_n d0, d1, s0, s1, s2, s3, sz, n
72        add             \d0\sz,  \s0\sz,  \s2\sz
73.if \n == 16
74        add             \d1\sz,  \s1\sz,  \s3\sz
75.endif
76.endm
77
// sub_n d0, d1, s0, s1, s2, s3, sz, n
// \d0 = \s0 - \s2; when \n == 16 also \d1 = \s1 - \s3.
78.macro sub_n d0, d1, s0, s1, s2, s3, sz, n
79        sub             \d0\sz,  \s0\sz,  \s2\sz
80.if \n == 16
81        sub             \d1\sz,  \s1\sz,  \s3\sz
82.endif
83.endm
84
// and_n d0, d1, s0, s1, s2, s3, sz, n
// \d0 = \s0 & \s2; when \n == 16 also \d1 = \s1 & \s3.
// Bitwise AND only exists with byte arrangements, so callers pass the byte
// arrangement (\szb in decode_update) as \sz.
85.macro and_n d0, d1, s0, s1, s2, s3, sz, n
86        and             \d0\sz,  \s0\sz,  \s2\sz
87.if \n == 16
88        and             \d1\sz,  \s1\sz,  \s3\sz
89.endif
90.endm
91
// cmhs_n d0, d1, s0, s1, s2, s3, sz, n
// Unsigned per-lane compare: \d0 = (\s0 >= \s2) ? all-ones : 0; when
// \n == 16 also \d1 = (\s1 >= \s3) ? all-ones : 0.
92.macro cmhs_n d0, d1, s0, s1, s2, s3, sz, n
93        cmhs            \d0\sz,  \s0\sz,  \s2\sz
94.if \n == 16
95        cmhs            \d1\sz,  \s1\sz,  \s3\sz
96.endif
97.endm
98
// sshl_n d0, d1, s0, s1, s2, s3, sz, n
// Signed per-lane variable shift: \d0 = \s0 shifted by \s2; when \n == 16
// also \d1 = \s1 shifted by \s3. A negative shift count shifts right, which
// is how decode_update uses it (it passes -rate).
99.macro sshl_n d0, d1, s0, s1, s2, s3, sz, n
100        sshl            \d0\sz,  \s0\sz,  \s2\sz
101.if \n == 16
102        sshl            \d1\sz,  \s1\sz,  \s3\sz
103.endif
104.endm
105
// sqdmulh_n d0, d1, s0, s1, s2, s3, sz, n
// Saturating doubling multiply returning the high half, per lane:
// \d0 = sqdmulh(\s0, \s2); when \n == 16 also \d1 = sqdmulh(\s1, \s3).
// decode_update passes the same register as \d1, \s2 and \s3; that is only
// safe because the first-half instruction reads it before the second-half
// instruction overwrites it.
106.macro sqdmulh_n d0, d1, s0, s1, s2, s3, sz, n
107        sqdmulh         \d0\sz,  \s0\sz,  \s2\sz
108.if \n == 16
109        sqdmulh         \d1\sz,  \s1\sz,  \s3\sz
110.endif
111.endm
112
// str_n idx0, idx1, dstreg, dstoff, n
// Store register \idx0 at [\dstreg, \dstoff]; when \n == 16 also store
// \idx1 16 bytes further on. Despite their names, \idx0/\idx1 are the
// registers to store (the visible caller passes q4/q5), not indices.
113.macro str_n            idx0, idx1, dstreg, dstoff, n
114        str             \idx0,  [\dstreg, \dstoff]
115.if \n == 16
116        str             \idx1,  [\dstreg, \dstoff + 16]
117.endif
118.endm
119
120// unsigned dav1d_msac_decode_symbol_adapt4_neon(MsacContext *s, uint16_t *cdf,
121//                                               size_t n_symbols);
122
// Decodes one symbol against an adaptive CDF using 4 halfword lanes.
//
// In:   x0 = s, x1 = cdf, x2 = n_symbols
// Out:  w0 = decoded symbol index (ret)
// Side effects: s->rng, s->cnt and s->dif are updated; the refill path also
//   advances s->buf_pos. When s->allow_update_cdf is non-zero the CDF vector
//   at [x1] and the counter at cdf[n_symbols] are rewritten.
//
// This function also hosts two pieces shared with the rest of the file:
//   - the decode_update macro, instantiated here with n = 4 and again by the
//     adapt8/adapt16 entry points;
//   - the L(refill) tail, branched to by every decoder that runs out of bits.
123function msac_decode_symbol_adapt4_neon, export=1
        // decode_update sz, szb, n
        //   sz  = halfword arrangement, szb = matching byte arrangement,
        //   n   = number of lanes processed (4, 8 or 16).
        // w15 carries the symbol in a width-specific encoding until the very
        // end: 16*ret for n == 4, 2*ret for n == 8, ret - 16 for n == 16.
124.macro decode_update sz, szb, n
125.if \n == 16
        // 48 bytes of scratch: halfword at sp+14 = original rng,
        // sp+16..sp+47 = the 16 v values, so u/v can be fetched by index.
126        sub             sp,  sp,  #48
127.endif
128        add             x8,  x0,  #RNG
129        ld1_n           v0,  v1,  x1,  \sz, \n                    // cdf
130        ld1r            {v29\sz}, [x8]                            // rng
131        movrel          x9,  coeffs, COEFFS_BASE_OFFSET
132        movi            v31\sz, #0x7f, lsl #8                     // 0x7f00
133        sub             x10, x9,  x2, lsl #1
134        mvni            v30\sz, #0x3f                             // 0xffc0
135        and             v7\szb, v29\szb, v31\szb                  // rng & 0x7f00
136.if \n == 16
137        str             h29, [sp, #14]                            // store original u = s->rng
138.endif
139        and_n           v2,  v3,  v0,  v1,  v30, v30, \szb, \n    // cdf & 0xffc0
140
141        ld1_n           v4,  v5,  x10, \sz, \n                    // EC_MIN_PROB * (n_symbols - ret)
142        sqdmulh_n       v6,  v7,  v2,  v3,  v7,  v7,  \sz, \n     // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1
143        ldr             d28, [x0, #DIF]
144
145        add_n           v4,  v5,  v2,  v3,  v4,  v5,  \sz, \n     // v = cdf + EC_MIN_PROB * (n_symbols - ret)
146        add_n           v4,  v5,  v6,  v7,  v4,  v5,  \sz, \n     // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret)
147
148        dup             v30\sz, v28.h[3]                          // dif >> (EC_WIN_SIZE - 16)
149.if \n == 8
        // MASKS8_OFFSET is not a multiple of 16, hence the unscaled ldur.
150        ldur            q31, [x9, #MASKS8_OFFSET]
151.elseif \n == 16
152        str_n           q4,  q5,  sp, #16, \n                     // store v values to allow indexed access
153.endif
154
155        // After the condition starts being true it continues, such that the vector looks like:
156        //   0, 0, 0 ... -1, -1
157        cmhs_n          v2,  v3,  v30, v30, v4,  v5,  \sz,  \n    // c >= v
158.if \n == 4
        // u[i] = v[i-1] with u[0] = rng: shift the v vector up one lane and
        // insert rng at the bottom.
159        ext             v29\szb, v29\szb, v4\szb, #6              // u
160        umov            x15, v2.d[0]
161        ldr             w4,  [x0, #ALLOW_UPDATE_CDF]
162        rev             x15, x15
163        sub             v29\sz, v29\sz, v4\sz                     // rng = u-v
164        // rev + clz = count trailing zeros
165        clz             x15, x15                                  // 16*ret
166.elseif \n == 8
167        // The final short of the compare is always set.
168        // Using addv, subtract -0x202*ret from this value to create a lookup table for a short.
169        //  For n == 8:
170        // -0x202 + -0x202 + ... + 0xF0E
171        //                    (0x202*7) | (1 << 8)
172        //                                    ^-------offset for second byte of the short
173        and             v31\szb, v31\szb, v2\szb
174        ext             v29\szb, v29\szb, v4\szb, #14             // u
175        addv            h31, v31\sz                               // ((2*ret + 1) << 8) | (2*ret)
176        ldr             w4,  [x0, #ALLOW_UPDATE_CDF]
177        sub             v30\sz, v30\sz, v4\sz                     // (dif >> 48) - v
178        smov            w15, v31.b[0]                             // 2*ret
179        sub             v29\sz, v29\sz, v4\sz                     // rng = u-v
180.elseif \n == 16
        // Each set compare lane is -1, so the horizontal sum is
        // -(number of lanes with c >= v) = ret - 16.
181        add             v6\sz,  v2\sz,  v3\sz
182        addv            h31, v6\sz                                // -n + ret
183        ldr             w4,  [x0, #ALLOW_UPDATE_CDF]
184        smov            w15, v31.h[0]
185.endif
186
        // Skip the adaptation entirely when s->allow_update_cdf == 0.
187        cbz             w4,  0f
188
189        // update_cdf
190        ldrh            w3,  [x1, x2, lsl #1]                     // count = cdf[n_symbols]
191.if \n == 16
192        // 16 case has a lower bound that guarantees n_symbols > 2
193        mov             w4,  #-5
194.elseif \n == 8
        // The carry set by cmn below is consumed by the sbc further down;
        // nothing in between may touch the flags.
195        mvn             w14, w2
196        mov             w4,  #-4
197        cmn             w14, #3                                   // set C if n_symbols <= 2
198.else
199        // if n_symbols < 4 (or < 6 even) then
200        //   (1 + n_symbols) >> 2 == n_symbols > 2
201        add             w14, w2,  #17                             // (1 + n_symbols) + (4 << 2)
202.endif
203        sub_n           v16, v17, v0,  v1,  v2,  v3,  \sz, \n     // cdf + (i >= val ? 1 : 0)
        // Turn the compare mask into the per-lane adaptation target:
        // 0xffff (-1) where i >= val, 0x8000 (32768) elsewhere.
204        orr             v2\sz, #0x80, lsl #8
205.if \n == 16
206        orr             v3\sz, #0x80, lsl #8
207.endif
208.if \n == 16
209        sub             w4,  w4,  w3, lsr #4                      // -((count >> 4) + 5)
210.elseif \n == 8
211        lsr             w14, w3,  #4                              // count >> 4
212        sbc             w4,  w4,  w14                             // -((count >> 4) + (n_symbols > 2) + 4)
213.else
214        neg             w4, w14, lsr #2                           // -((n_symbols > 2) + 4)
215        sub             w4,  w4,  w3,  lsr #4                     // -((count >> 4) + (n_symbols > 2) + 4)
216.endif
217        sub_n           v2,  v3,  v2,  v3,  v0,  v1,  \sz, \n     // (32768 - cdf[i]) or (-1 - cdf[i])
218        dup             v6\sz,    w4                              // -rate
219
220        sub             w3,  w3,  w3, lsr #5                      // count - (count == 32)
221        sshl_n          v2,  v3,  v2,  v3,  v6,  v6,  \sz, \n     // ({32768,-1} - cdf[i]) >> rate
222        add             w3,  w3,  #1                              // count + (count < 32)
223        add_n           v0,  v1,  v16, v17, v2,  v3,  \sz, \n     // cdf + (32768 - cdf[i]) >> rate
        // The vector store covers every lane, including the one holding the
        // counter; the strh that follows puts the updated counter back.
224        st1_n           v0,  v1,  x1,  \sz, \n
225        strh            w3,  [x1, x2, lsl #1]
226
2270:
228        // renorm
        // All three variants compute d, store rng << d, subtract d from cnt
        // and either return directly or hand over to the refill tail
        // with w6 = cnt, x7 = dif and w15 = ret.
229.if \n == 4
230        ldr             w6,  [x0, #CNT]
231        ldr             x7,  [x0, #DIF]
232        mov             x4,  v29.d[0]          // rng (packed)
233        mov             x3,  v4.d[0]           // v (packed)
234
235        // Shift 'v'/'rng' for ret into the 16 least sig bits. There is
236        //  garbage in the remaining bits, but we can work around this.
237        lsr             x4,  x4,  x15          // rng
238        lsr             x3,  x3,  x15          // v
239        lsl             w5,  w4,  #16          // rng << 16
240        sub             x7,  x7,  x3, lsl #48  // dif - (v << 48)
241        clz             w5,  w5                // d = clz(rng << 16)
242        lsl             w4,  w4,  w5           // rng << d
243        subs            w6,  w6,  w5           // cnt -= d
244        lsl             x7,  x7,  x5           // (dif - (v << 48)) << d
        // Halfword store: the upper bits of w4 still hold garbage.
245        strh            w4,  [x0, #RNG]
        // Flags are still those of the subs above: borrow means refill.
246        b.lo            1f
247        str             w6,  [x0, #CNT]
248        str             x7,  [x0, #DIF]
249        lsr             w0,  w15, #4
250        ret
2511:
252        lsr             w15, w15, #4
253        b L(refill)
254.elseif \n == 8
255        ldr             w6,  [x0, #CNT]
        // v31 holds the byte indices {2*ret, 2*ret+1}, so each tbl moves
        // halfword lane 'ret' into lane 0.
256        tbl             v30.8b, {v30.16b}, v31.8b
257        tbl             v29.8b, {v29.16b}, v31.8b
258        ins             v28.h[3], v30.h[0]     // dif - (v << 48)
259        clz             v0.4h,  v29.4h         // d = clz(rng)
260        umov            w5,  v0.h[0]
261        ushl            v29.4h, v29.4h, v0.4h  // rng << d
262
263        // The vec for clz(rng) is filled with garbage after the first short,
264        //  but ushl/sshl conveniently uses only the first byte for the shift
265        //  amount.
266        ushl            d28, d28, d0           // (dif - (v << 48)) << d
267
268        subs            w6,  w6,  w5           // cnt -= d
269        str             h29, [x0, #RNG]
270        b.lo            1f
271        str             w6,  [x0, #CNT]
272        str             d28, [x0, #DIF]
273        lsr             w0,  w15, #1           // ret
274        ret
2751:
276        lsr             w15, w15, #1           // ret
        // The refill tail expects dif in x7, not in a vector register.
277        mov             x7, v28.d[0]
278        b L(refill)
279.elseif \n == 16
        // w15 = ret - 16, so x8 + 48 = sp + 16 + 2*ret addresses v[ret] and
        // x8 + 46 addresses v[ret - 1], or the saved rng at sp+14 when
        // ret == 0.
280        add             x8,  sp,  w15, sxtw #1
281        ldrh            w3,  [x8, #48]         // v
282        ldurh           w4,  [x8, #46]         // u
283        ldr             w6,  [x0, #CNT]
284        ldr             x7,  [x0, #DIF]
285        sub             w4,  w4,  w3           // rng = u - v
286        clz             w5,  w4                // clz(rng)
287        eor             w5,  w5,  #16          // d = clz(rng) ^ 16
288        sub             x7,  x7,  x3, lsl #48  // dif - (v << 48)
289        lsl             w4,  w4,  w5           // rng << d
290        subs            w6,  w6,  w5           // cnt -= d
291        lsl             x7,  x7,  x5           // (dif - (v << 48)) << d
292        str             w4,  [x0, #RNG]
        // Release the scratch area; this add does not modify the flags
        // tested by the b.lo below.
293        add             sp,  sp,  #48
294        b.lo            1f
295        str             w6,  [x0, #CNT]
296        str             x7,  [x0, #DIF]
297        add             w0,  w15, #\n          // ret
298        ret
2991:
300        add             w15, w15, #\n          // ret
301        b L(refill)
302.endif
303.endm
304
305        decode_update   .4h, .8b, 4
306
        // Shared slow path, taken when "cnt -= d" borrowed.
        // In:  x0 = s, w6 = cnt after the subtraction, x7 = dif after the
        //      shift, w15 = value to return. Every caller has already
        //      stored the new rng.
        // Out: w0 = w15; s->cnt and s->dif stored; s->buf_pos advanced
        //      unless the buffer was already exhausted.
307L(refill):
308        // refill
309        ldp             x3,  x4,  [x0]         // BUF_POS, BUF_END
310        add             x5,  x3,  #8
311        subs            x5,  x5,  x4
        // Fewer than 8 bytes left: an 8-byte load at buf_pos would run past
        // buf_end.
312        b.hi            6f
313
314        ldr             x8,  [x3]              // next_bits
315        add             w4,  w6,  #-48         // shift_bits = cnt + 16 (- 64)
        // The new bits are inverted before being merged into dif.
316        mvn             x8,  x8
317        neg             w5,  w4
318        rev             x8,  x8                // next_bits = bswap(next_bits)
319        lsr             w5,  w5,  #3           // num_bytes_read
320        lsr             x8,  x8,  x4           // next_bits >>= (shift_bits & 63)
321
3222:      // refill_end
323        add             x3,  x3,  x5
324        add             w6,  w6,  w5, lsl #3   // cnt += num_bits_read
325        str             x3,  [x0, #BUF_POS]
326
3273:      // refill_end2
328        orr             x7,  x7,  x8           // dif |= next_bits
329
3304:      // end
331        str             w6,  [x0, #CNT]
332        str             x7,  [x0, #DIF]
333
334        mov             w0,  w15
335        ret
336
3375:      // pad_with_ones
        // Reached when buf_pos >= buf_end: nothing is read and buf_pos/cnt
        // are left as they are.
        // NOTE(review): the padding pattern is derived from cnt alone via a
        // self-rotate; confirm the intended bit pattern before changing it.
338        add             w8,  w6,  #-16
339        ror             x8,  x8,  x8
340        b               3b
341
3426:      // refill_eob
        // On entry x5 = buf_pos + 8 - buf_end (> 0), the number of bytes by
        // which a full 8-byte read would overshoot.
343        cmp             x3,  x4
344        b.hs            5b
345
        // Read the last 8 bytes of the buffer instead and shift out the
        // bytes that precede buf_pos; the zeros shifted in become ones after
        // the mvn.
346        ldr             x8,  [x4, #-8]
347        lsl             w5,  w5,  #3
348        lsr             x8,  x8,  x5
349        add             w5,  w6,  #-48
350        mvn             x8,  x8
351        sub             w4,  w4,  w3           // num_bytes_left
352        rev             x8,  x8
353        lsr             x8,  x8,  x5
354        neg             w5,  w5
355        lsr             w5,  w5,  #3
        // Never advance buf_pos past buf_end.
356        cmp             w5,  w4
357        csel            w5,  w5,  w4,  lo      // num_bytes_read
358        b               2b
359endfunc
360
// 8-lane variant of the adaptive symbol decoder: instantiates the
// decode_update macro (defined inside msac_decode_symbol_adapt4_neon) with
// full 128-bit vectors and n = 8.
// In:   x0 = context pointer, x1 = cdf, x2 = n_symbols
// Out:  w0 = decoded symbol index
// Its slow path branches to L(refill) in msac_decode_symbol_adapt4_neon.
361function msac_decode_symbol_adapt8_neon, export=1
362        decode_update   .8h, .16b, 8
363endfunc
364
// 16-lane variant of the adaptive symbol decoder: instantiates the
// decode_update macro (defined inside msac_decode_symbol_adapt4_neon) with
// two 128-bit vectors per operand and n = 16.
// In:   x0 = context pointer, x1 = cdf, x2 = n_symbols
// Out:  w0 = decoded symbol index
// Uses 48 bytes of stack scratch for the duration of the call; the macro
// releases it before returning or branching to L(refill).
365function msac_decode_symbol_adapt16_neon, export=1
366        decode_update   .8h, .16b, 16
367endfunc
368
// Decodes a "hi tok" value by repeatedly decoding a 4-lane symbol from the
// same CDF until a symbol below 3 is seen or four symbols have been read.
//
// In:   x0 = context pointer (field offsets as #defined at the top of the
//            file)
//       x1 = cdf: three CDF halfwords followed by the adaptation counter at
//            cdf[3] (n_symbols is fixed at 3)
// Out:  w0 = 3 + sum of the decoded symbols
// Side effects: rng/cnt/dif (and buf_pos on refill) in the context are
//   updated; the CDF and its counter are adapted after every symbol when
//   allow_update_cdf is non-zero.
//
// State kept in registers across iterations:
//   v0 = cdf, v17 = cdf & 0xffc0, w9 = count, w10 = allow_update_cdf,
//   v3 = rng (broadcast), v1 = dif >> 48 (broadcast), w6 = cnt, x7 = dif,
//   w13 = biased accumulator of 16*symbol used for loop control and result.
// This function carries its own copy of the refill sequence rather than
// branching to L(refill), because it has to continue looping afterwards.
369function msac_decode_hi_tok_neon, export=1
370        ld1             {v0.4h},  [x1]            // cdf
371        add             x16, x0,  #RNG
372        movi            v31.4h, #0x7f, lsl #8     // 0x7f00
373        movrel          x17, coeffs, COEFFS_BASE_OFFSET-2*3
374        mvni            v30.4h, #0x3f             // 0xffc0
375        ldrh            w9,  [x1, #6]             // count = cdf[n_symbols]
376        ld1r            {v3.4h},  [x16]           // rng
377        ld1             {v29.4h}, [x17]           // EC_MIN_PROB * (n_symbols - ret)
378        add             x17, x0,  #DIF + 6
        // w13 starts at -24*8 and gains 16*symbol per iteration, so it
        // reaches 0 exactly when four symbols equal to 3 have been decoded.
379        mov             w13, #-24*8
380        and             v17.8b,  v0.8b,   v30.8b  // cdf & 0xffc0
381        ldr             w10, [x0, #ALLOW_UPDATE_CDF]
382        ld1r            {v1.8h},  [x17]           // dif >> (EC_WIN_SIZE - 16)
383        ldr             w6,  [x0, #CNT]
384        ldr             x7,  [x0, #DIF]
3851:
386        and             v7.8b,   v3.8b,   v31.8b  // rng & 0x7f00
387        sqdmulh         v6.4h,   v17.4h,  v7.4h   // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1
388        add             v4.4h,   v17.4h,  v29.4h  // v = cdf + EC_MIN_PROB * (n_symbols - ret)
389        add             v4.4h,   v6.4h,   v4.4h   // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret)
390        cmhs            v2.4h,   v1.4h,   v4.4h   // c >= v
        // Pre-add 5*8; the matching 5*8 is subtracted from w15 at label 5 so
        // that the final adds produces the loop-exit carry.
391        add             w13, w13, #5*8
392        ext             v18.8b, v3.8b,  v4.8b, #6 // u
393        umov            x15, v2.d[0]
394        rev             x15, x15
395        sub             v18.4h, v18.4h, v4.4h     // rng = u-v
396        // rev + clz = count trailing zeros
397        clz             x15, x15                  // 16*ret
398
399        cbz             w10, 2f
400        // update_cdf
401        sub             v5.4h,   v0.4h,   v2.4h   // cdf[i] + (i >= val ? 1 : 0)
        // n_symbols is always 3 here, so the rate bias is the constant 5.
402        mov             w4,  #-5
403        orr             v2.4h, #0x80, lsl #8      // i >= val ? -1 : 32768
404        sub             w4,  w4,  w9, lsr #4      // -((count >> 4) + 5)
405        sub             v2.4h,   v2.4h,   v0.4h   // (32768 - cdf[i]) or (-1 - cdf[i])
406        dup             v6.4h,    w4              // -rate
407
408        sub             w9,  w9,  w9, lsr #5      // count - (count == 32)
409        sshl            v2.4h,   v2.4h,   v6.4h   // ({32768,-1} - cdf[i]) >> rate
410        add             w9,  w9,  #1              // count + (count < 32)
411        add             v0.4h,   v5.4h,   v2.4h   // cdf[i] + (32768 - cdf[i]) >> rate
412        st1             {v0.4h},  [x1]
        // Refresh the masked copy so the next iteration sees the adapted cdf.
413        and             v17.8b,  v0.8b,   v30.8b  // cdf & 0xffc0
414        strh            w9,  [x1, #6]
415
4162:
417        mov             x4,  v18.d[0]          // rng (packed)
418        mov             x3,  v4.d[0]           // v (packed)
419
420        // Shift 'v'/'rng' for ret into the 16 least sig bits. There is
421        //  garbage in the remaining bits, but we can work around this.
422        lsr             x4,  x4,  x15          // rng
423        lsr             x3,  x3,  x15          // v
424        lsl             w5,  w4,  #16          // rng << 16
425        sub             x7,  x7,  x3, lsl #48  // dif - (v << 48)
426        clz             w5,  w5                // d = clz(rng << 16)
427        lsl             w4,  w4,  w5           // rng << d
428        subs            w6,  w6,  w5           // cnt -= d
429        lsl             x7,  x7,  x5           // (dif - (v << 48)) << d
430        strh            w4,  [x0, #RNG]
        // New rng for the next iteration; dup does not modify the flags
        // tested by the b.hs below.
431        dup             v3.4h,   w4
432        b.hs            5f
433
434        // refill
435        ldp             x3,  x4,  [x0]         // BUF_POS, BUF_END
436        add             x5,  x3,  #8
437        subs            x5,  x5,  x4
438        b.hi            7f
439
440        ldr             x8,  [x3]              // next_bits
441        add             w4,  w6,  #-48         // shift_bits = cnt + 16 (- 64)
442        mvn             x8,  x8
443        neg             w5,  w4
444        rev             x8,  x8                // next_bits = bswap(next_bits)
445        lsr             w5,  w5,  #3           // num_bytes_read
446        lsr             x8,  x8,  x4           // next_bits >>= (shift_bits & 63)
447
4483:      // refill_end
449        add             x3,  x3,  x5
450        add             w6,  w6,  w5, lsl #3   // cnt += num_bits_read
451        str             x3,  [x0, #BUF_POS]
452
4534:      // refill_end2
454        orr             x7,  x7,  x8           // dif |= next_bits
455
4565:      // end
        // w15 = 16*ret - 40 is negative for ret < 3, which always carries
        // out of the adds; for ret == 3 it is +8 and only carries once w13
        // has reached -8, i.e. on the fourth symbol.
457        sub             w15, w15, #5*8
458        lsr             x12, x7,  #48
459        adds            w13, w13, w15          // carry = tok_br < 3 || tok == 15
        // Broadcast the new top 16 bits of dif for the next iteration.
460        dup             v1.8h,   w12
461        b.cc            1b                     // loop if !carry
        // Remove the bias: (w13 + 240) >> 4 = 3 + sum of symbols.
462        add             w13, w13, #30*8
463        str             w6,  [x0, #CNT]
464        str             x7,  [x0, #DIF]
465        lsr             w0,  w13, #4
466        ret
467
4686:      // pad_with_ones
        // Reached when buf_pos >= buf_end; nothing is read.
        // NOTE(review): padding pattern is derived from cnt alone via a
        // self-rotate; confirm the intended bit pattern before changing it.
469        add             w8,  w6,  #-16
470        ror             x8,  x8,  x8
471        b               4b
472
4737:      // refill_eob
        // x5 = buf_pos + 8 - buf_end (> 0): fewer than 8 bytes remain.
474        cmp             x3,  x4
475        b.hs            6b
476
477        ldr             x8,  [x4, #-8]
478        lsl             w5,  w5,  #3
479        lsr             x8,  x8,  x5
480        add             w5,  w6,  #-48
481        mvn             x8,  x8
482        sub             w4,  w4,  w3           // num_bytes_left
483        rev             x8,  x8
484        lsr             x8,  x8,  x5
485        neg             w5,  w5
486        lsr             w5,  w5,  #3
487        cmp             w5,  w4
488        csel            w5,  w5,  w4,  lo      // num_bytes_read
489        b               3b
490endfunc
491
// Decodes one bit with a fixed, equal split of the range.
//
// In:   x0 = context pointer
// Out:  w0 = decoded bit = (dif < vw)
// Side effects: rng, cnt and dif in the context are updated; falls into the
//   shared L(refill) tail (with w6 = cnt, x7 = dif, w15 = result) when cnt
//   runs out.
492function msac_decode_bool_equi_neon, export=1
493        ldp             w5,  w6,  [x0, #RNG]   // + CNT
494        ldr             x7,  [x0, #DIF]
        // w4 = 2*v = (r & 0xff00) + 8, so shifting it by 47 instead of 48
        // gives vw = v << 48 without computing v first.
495        bic             w4,  w5,  #0xff        // r &= 0xff00
496        add             w4,  w4,  #8
497        subs            x8,  x7,  x4, lsl #47  // dif - vw
498        lsr             w4,  w4,  #1           // v
499        sub             w5,  w5,  w4           // r - v
        // The flags from the subs above are still live for the next three
        // instructions.
500        cset            w15, lo                // result = (dif < vw)
501        csel            w4,  w5,  w4,  hs      // if (dif >= vw) v = r - v;
502        csel            x7,  x8,  x7,  hs      // if (dif >= vw) dif = dif - vw;
503
504        clz             w5,  w4                // clz(rng)
505        eor             w5,  w5,  #16          // d = clz(rng) ^ 16
506        lsl             w4,  w4,  w5           // rng << d
507        subs            w6,  w6,  w5           // cnt -= d
508        lsl             x7,  x7,  x5           // (dif - (v << 48)) << d
509        str             w4,  [x0, #RNG]
510        b.lo            L(refill)
511
512        str             w6,  [x0, #CNT]
513        str             x7,  [x0, #DIF]
514        mov             w0,  w15
515        ret
516endfunc
517
// Decodes one bit using the probability passed in w1.
//
// In:   x0 = context pointer, w1 = f (its low 6 bits are ignored)
// Out:  w0 = decoded bit = (dif < vw)
// Side effects: rng, cnt and dif in the context are updated; falls into the
//   shared L(refill) tail (with w6 = cnt, x7 = dif, w15 = result) when cnt
//   runs out. w1 is clobbered.
518function msac_decode_bool_neon, export=1
519        ldp             w5,  w6,  [x0, #RNG]   // + CNT
520        ldr             x7,  [x0, #DIF]
521        lsr             w4,  w5,  #8           // r >> 8
522        bic             w1,  w1,  #0x3f        // f &= ~63
523        mul             w4,  w4,  w1
524        lsr             w4,  w4,  #7
525        add             w4,  w4,  #4           // v
526        subs            x8,  x7,  x4, lsl #48  // dif - vw
527        sub             w5,  w5,  w4           // r - v
        // The flags from the subs above are still live for the next three
        // instructions.
528        cset            w15, lo                // result = (dif < vw)
529        csel            w4,  w5,  w4,  hs      // if (dif >= vw) v = r - v;
530        csel            x7,  x8,  x7,  hs      // if (dif >= vw) dif = dif - vw;
531
532        clz             w5,  w4                // clz(rng)
533        eor             w5,  w5,  #16          // d = clz(rng) ^ 16
534        lsl             w4,  w4,  w5           // rng << d
535        subs            w6,  w6,  w5           // cnt -= d
536        lsl             x7,  x7,  x5           // (dif - (v << 48)) << d
537        str             w4,  [x0, #RNG]
538        b.lo            L(refill)
539
540        str             w6,  [x0, #CNT]
541        str             x7,  [x0, #DIF]
542        mov             w0,  w15
543        ret
544endfunc
545
// Decodes one bit using an adaptive probability and then adapts it.
//
// In:   x0 = context pointer
//       x1 = cdf: halfword cdf[0] is the probability, halfword cdf[1] the
//            adaptation counter (both fetched with a single 32-bit load)
// Out:  w0 = decoded bit = (dif < vw)
// Side effects: rng, cnt and dif in the context are updated; when
//   allow_update_cdf is non-zero cdf[0] and cdf[1] are rewritten. Falls into
//   the shared L(refill) tail (with w6 = cnt, x7 = dif, w15 = result) when
//   cnt runs out.
546function msac_decode_bool_adapt_neon, export=1
547        ldr             w9,  [x1]              // cdf[0-1]
548        ldp             w5,  w6,  [x0, #RNG]   // + CNT
549        ldr             x7,  [x0, #DIF]
550        lsr             w4,  w5,  #8           // r >> 8
        // The mask both drops the low 6 bits of cdf[0] and discards the
        // counter held in the upper half of w9.
551        and             w2,  w9,  #0xffc0      // f &= ~63
552        mul             w4,  w4,  w2
553        lsr             w4,  w4,  #7
554        add             w4,  w4,  #4           // v
555        subs            x8,  x7,  x4, lsl #48  // dif - vw
556        sub             w5,  w5,  w4           // r - v
557        cset            w15, lo                // bit = (dif < vw)
558        csel            w4,  w5,  w4,  hs      // if (dif >= vw) v = r - v;
559        csel            x7,  x8,  x7,  hs      // if (dif >= vw) dif = dif - vw;
560
561        ldr             w10, [x0, #ALLOW_UPDATE_CDF]
562
563        clz             w5,  w4                // clz(rng)
564        eor             w5,  w5,  #16          // d = clz(rng) ^ 16
565
566        cbz             w10, 1f
567
        // CDF adaptation. count >> 4 is at most 2 while count <= 32, so the
        // add of 4 below equals the "| 4" named in its comment.
568        lsr             w2,  w9,  #16          // count = cdf[1]
569        and             w9,  w9,  #0xffff      // cdf[0]
570
571        sub             w3,  w2,  w2, lsr #5   // count - (count >= 32)
572        lsr             w2,  w2,  #4           // count >> 4
573        add             w10, w3,  #1           // count + (count < 32)
574        add             w2,  w2,  #4           // rate = (count >> 4) | 4
575
        // Branch-free update: cdf[0] moves up towards 32768 when bit == 1
        // and down towards 0 when bit == 0, by a 1/2^rate fraction of the
        // distance.
576        sub             w9,  w9,  w15          // cdf[0] -= bit
577        sub             w11, w9,  w15, lsl #15 // {cdf[0], cdf[0] - 32769}
578        asr             w11, w11, w2           // {cdf[0], cdf[0] - 32769} >> rate
579        sub             w9,  w9,  w11          // cdf[0]
580
581        strh            w9,  [x1]
582        strh            w10, [x1, #2]
583
5841:
585        lsl             w4,  w4,  w5           // rng << d
586        subs            w6,  w6,  w5           // cnt -= d
587        lsl             x7,  x7,  x5           // (dif - (v << 48)) << d
588        str             w4,  [x0, #RNG]
589        b.lo            L(refill)
590
591        str             w6,  [x0, #CNT]
592        str             x7,  [x0, #DIF]
593        mov             w0,  w15
594        ret
595endfunc
596