xref: /aosp_15_r20/external/libdav1d/src/arm/64/msac.S (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
1*c0909341SAndroid Build Coastguard Worker/*
2*c0909341SAndroid Build Coastguard Worker * Copyright © 2019, VideoLAN and dav1d authors
3*c0909341SAndroid Build Coastguard Worker * Copyright © 2019, Martin Storsjo
4*c0909341SAndroid Build Coastguard Worker * All rights reserved.
5*c0909341SAndroid Build Coastguard Worker *
6*c0909341SAndroid Build Coastguard Worker * Redistribution and use in source and binary forms, with or without
7*c0909341SAndroid Build Coastguard Worker * modification, are permitted provided that the following conditions are met:
8*c0909341SAndroid Build Coastguard Worker *
9*c0909341SAndroid Build Coastguard Worker * 1. Redistributions of source code must retain the above copyright notice, this
10*c0909341SAndroid Build Coastguard Worker *    list of conditions and the following disclaimer.
11*c0909341SAndroid Build Coastguard Worker *
12*c0909341SAndroid Build Coastguard Worker * 2. Redistributions in binary form must reproduce the above copyright notice,
13*c0909341SAndroid Build Coastguard Worker *    this list of conditions and the following disclaimer in the documentation
14*c0909341SAndroid Build Coastguard Worker *    and/or other materials provided with the distribution.
15*c0909341SAndroid Build Coastguard Worker *
16*c0909341SAndroid Build Coastguard Worker * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17*c0909341SAndroid Build Coastguard Worker * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18*c0909341SAndroid Build Coastguard Worker * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19*c0909341SAndroid Build Coastguard Worker * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
20*c0909341SAndroid Build Coastguard Worker * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21*c0909341SAndroid Build Coastguard Worker * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22*c0909341SAndroid Build Coastguard Worker * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23*c0909341SAndroid Build Coastguard Worker * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24*c0909341SAndroid Build Coastguard Worker * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25*c0909341SAndroid Build Coastguard Worker * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26*c0909341SAndroid Build Coastguard Worker */
27*c0909341SAndroid Build Coastguard Worker
28*c0909341SAndroid Build Coastguard Worker#include "src/arm/asm.S"
29*c0909341SAndroid Build Coastguard Worker#include "util.S"
30*c0909341SAndroid Build Coastguard Worker
// Byte offsets of the MsacContext fields that the code in this file accesses
// relative to x0 (the context pointer).
31*c0909341SAndroid Build Coastguard Worker#define BUF_POS 0
32*c0909341SAndroid Build Coastguard Worker#define BUF_END 8
33*c0909341SAndroid Build Coastguard Worker#define DIF 16
34*c0909341SAndroid Build Coastguard Worker#define RNG 24
35*c0909341SAndroid Build Coastguard Worker#define CNT 28
36*c0909341SAndroid Build Coastguard Worker#define ALLOW_UPDATE_CDF 32
37*c0909341SAndroid Build Coastguard Worker
// coeffs + COEFFS_BASE_OFFSET is the address of the 16th short (the final 0)
// of the first row of the coeffs table. MASKS8_OFFSET is the distance from
// that address to the masks8 row, which begins 64 bytes into the table.
38*c0909341SAndroid Build Coastguard Worker#define COEFFS_BASE_OFFSET 30
39*c0909341SAndroid Build Coastguard Worker#define MASKS8_OFFSET (64-COEFFS_BASE_OFFSET)
40*c0909341SAndroid Build Coastguard Worker
// Constant table shared by the symbol decoders.
//
// Rows 1-2: the decoders load their per-lane minimum-probability terms from
// (coeffs + COEFFS_BASE_OFFSET - 2*n_symbols), so lane i receives
// 4 * (n_symbols - i) for i < n_symbols and 0 for every later lane (the
// second row supplies the zeros for lanes past the end of the first row).
//
// Row 3 (masks8): used only by the 8-lane decoder. It is ANDed with the
// compare mask and summed across lanes; see the n == 8 branch of
// decode_update for how the sum becomes a byte-index pair for tbl.
41*c0909341SAndroid Build Coastguard Workerconst coeffs
42*c0909341SAndroid Build Coastguard Worker        .short 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0
43*c0909341SAndroid Build Coastguard Worker        .short 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 0, 0
44*c0909341SAndroid Build Coastguard Worker        // masks8
45*c0909341SAndroid Build Coastguard Worker        .short -0x202, -0x202, -0x202, -0x202, -0x202, -0x202, -0x202, 0xF0E
46*c0909341SAndroid Build Coastguard Workerendconst
47*c0909341SAndroid Build Coastguard Worker
// ld1_n d0, d1, src, sz, n
// Loads from [src] into the single register d0 when n <= 8, otherwise into
// the register list {d0, d1}. sz is the arrangement suffix appended to each
// register name. The callers in this file pass adjacent registers
// (v0/v1, v4/v5) for d0/d1.
48*c0909341SAndroid Build Coastguard Worker.macro ld1_n d0, d1, src, sz, n
49*c0909341SAndroid Build Coastguard Worker.if \n <= 8
50*c0909341SAndroid Build Coastguard Worker        ld1             {\d0\sz},  [\src]
51*c0909341SAndroid Build Coastguard Worker.else
52*c0909341SAndroid Build Coastguard Worker        ld1             {\d0\sz, \d1\sz},  [\src]
53*c0909341SAndroid Build Coastguard Worker.endif
54*c0909341SAndroid Build Coastguard Worker.endm
55*c0909341SAndroid Build Coastguard Worker
// st1_n s0, s1, dst, sz, n
// Store counterpart of ld1_n: writes the single register s0 to [dst] when
// n <= 8, otherwise the register list {s0, s1}. sz is the arrangement suffix
// appended to each register name.
56*c0909341SAndroid Build Coastguard Worker.macro st1_n s0, s1, dst, sz, n
57*c0909341SAndroid Build Coastguard Worker.if \n <= 8
58*c0909341SAndroid Build Coastguard Worker        st1             {\s0\sz},  [\dst]
59*c0909341SAndroid Build Coastguard Worker.else
60*c0909341SAndroid Build Coastguard Worker        st1             {\s0\sz, \s1\sz},  [\dst]
61*c0909341SAndroid Build Coastguard Worker.endif
62*c0909341SAndroid Build Coastguard Worker.endm
63*c0909341SAndroid Build Coastguard Worker
// ushr_n d0, d1, s0, s1, shift, sz, n
// Emits d0 = s0 >> shift (unsigned, per lane); when n == 16 also emits
// d1 = s1 >> shift for the second register of the pair. sz is the
// arrangement suffix appended to each register name.
64*c0909341SAndroid Build Coastguard Worker.macro ushr_n d0, d1, s0, s1, shift, sz, n
65*c0909341SAndroid Build Coastguard Worker        ushr            \d0\sz,  \s0\sz,  \shift
66*c0909341SAndroid Build Coastguard Worker.if \n == 16
67*c0909341SAndroid Build Coastguard Worker        ushr            \d1\sz,  \s1\sz,  \shift
68*c0909341SAndroid Build Coastguard Worker.endif
69*c0909341SAndroid Build Coastguard Worker.endm
70*c0909341SAndroid Build Coastguard Worker
// add_n d0, d1, s0, s1, s2, s3, sz, n
// Emits d0 = s0 + s2 (per lane); when n == 16 also emits d1 = s1 + s3 for
// the second register of the pair. sz is the arrangement suffix appended to
// each register name.
71*c0909341SAndroid Build Coastguard Worker.macro add_n d0, d1, s0, s1, s2, s3, sz, n
72*c0909341SAndroid Build Coastguard Worker        add             \d0\sz,  \s0\sz,  \s2\sz
73*c0909341SAndroid Build Coastguard Worker.if \n == 16
74*c0909341SAndroid Build Coastguard Worker        add             \d1\sz,  \s1\sz,  \s3\sz
75*c0909341SAndroid Build Coastguard Worker.endif
76*c0909341SAndroid Build Coastguard Worker.endm
77*c0909341SAndroid Build Coastguard Worker
// sub_n d0, d1, s0, s1, s2, s3, sz, n
// Emits d0 = s0 - s2 (per lane); when n == 16 also emits d1 = s1 - s3 for
// the second register of the pair. sz is the arrangement suffix appended to
// each register name.
78*c0909341SAndroid Build Coastguard Worker.macro sub_n d0, d1, s0, s1, s2, s3, sz, n
79*c0909341SAndroid Build Coastguard Worker        sub             \d0\sz,  \s0\sz,  \s2\sz
80*c0909341SAndroid Build Coastguard Worker.if \n == 16
81*c0909341SAndroid Build Coastguard Worker        sub             \d1\sz,  \s1\sz,  \s3\sz
82*c0909341SAndroid Build Coastguard Worker.endif
83*c0909341SAndroid Build Coastguard Worker.endm
84*c0909341SAndroid Build Coastguard Worker
// and_n d0, d1, s0, s1, s2, s3, sz, n
// Emits d0 = s0 & s2 (bitwise); when n == 16 also emits d1 = s1 & s3 for the
// second register of the pair. sz is the arrangement suffix appended to each
// register name; the caller in this file passes the byte arrangement.
85*c0909341SAndroid Build Coastguard Worker.macro and_n d0, d1, s0, s1, s2, s3, sz, n
86*c0909341SAndroid Build Coastguard Worker        and             \d0\sz,  \s0\sz,  \s2\sz
87*c0909341SAndroid Build Coastguard Worker.if \n == 16
88*c0909341SAndroid Build Coastguard Worker        and             \d1\sz,  \s1\sz,  \s3\sz
89*c0909341SAndroid Build Coastguard Worker.endif
90*c0909341SAndroid Build Coastguard Worker.endm
91*c0909341SAndroid Build Coastguard Worker
// cmhs_n d0, d1, s0, s1, s2, s3, sz, n
// Emits the unsigned per-lane compare d0 = (s0 >= s2) ? all-ones : 0; when
// n == 16 also emits d1 = (s1 >= s3) for the second register of the pair.
// sz is the arrangement suffix appended to each register name.
92*c0909341SAndroid Build Coastguard Worker.macro cmhs_n d0, d1, s0, s1, s2, s3, sz, n
93*c0909341SAndroid Build Coastguard Worker        cmhs            \d0\sz,  \s0\sz,  \s2\sz
94*c0909341SAndroid Build Coastguard Worker.if \n == 16
95*c0909341SAndroid Build Coastguard Worker        cmhs            \d1\sz,  \s1\sz,  \s3\sz
96*c0909341SAndroid Build Coastguard Worker.endif
97*c0909341SAndroid Build Coastguard Worker.endm
98*c0909341SAndroid Build Coastguard Worker
// sshl_n d0, d1, s0, s1, s2, s3, sz, n
// Emits the signed per-lane register shift d0 = sshl(s0, s2); when n == 16
// also emits d1 = sshl(s1, s3) for the second register of the pair. The
// caller in this file passes a negative shift count so that the operation is
// a right shift. sz is the arrangement suffix appended to each register name.
99*c0909341SAndroid Build Coastguard Worker.macro sshl_n d0, d1, s0, s1, s2, s3, sz, n
100*c0909341SAndroid Build Coastguard Worker        sshl            \d0\sz,  \s0\sz,  \s2\sz
101*c0909341SAndroid Build Coastguard Worker.if \n == 16
102*c0909341SAndroid Build Coastguard Worker        sshl            \d1\sz,  \s1\sz,  \s3\sz
103*c0909341SAndroid Build Coastguard Worker.endif
104*c0909341SAndroid Build Coastguard Worker.endm
105*c0909341SAndroid Build Coastguard Worker
// sqdmulh_n d0, d1, s0, s1, s2, s3, sz, n
// Emits the per-lane saturating doubling multiply returning the high half,
// d0 = sqdmulh(s0, s2); when n == 16 also emits d1 = sqdmulh(s1, s3) for the
// second register of the pair. sz is the arrangement suffix appended to each
// register name.
106*c0909341SAndroid Build Coastguard Worker.macro sqdmulh_n d0, d1, s0, s1, s2, s3, sz, n
107*c0909341SAndroid Build Coastguard Worker        sqdmulh         \d0\sz,  \s0\sz,  \s2\sz
108*c0909341SAndroid Build Coastguard Worker.if \n == 16
109*c0909341SAndroid Build Coastguard Worker        sqdmulh         \d1\sz,  \s1\sz,  \s3\sz
110*c0909341SAndroid Build Coastguard Worker.endif
111*c0909341SAndroid Build Coastguard Worker.endm
112*c0909341SAndroid Build Coastguard Worker
// str_n idx0, idx1, dstreg, dstoff, n
// Stores register idx0 at [dstreg, dstoff]; when n == 16 also stores idx1
// 16 bytes further on. The 16-byte stride matches the q registers that the
// caller in this file passes.
113*c0909341SAndroid Build Coastguard Worker.macro str_n            idx0, idx1, dstreg, dstoff, n
114*c0909341SAndroid Build Coastguard Worker        str             \idx0,  [\dstreg, \dstoff]
115*c0909341SAndroid Build Coastguard Worker.if \n == 16
116*c0909341SAndroid Build Coastguard Worker        str             \idx1,  [\dstreg, \dstoff + 16]
117*c0909341SAndroid Build Coastguard Worker.endif
118*c0909341SAndroid Build Coastguard Worker.endm
119*c0909341SAndroid Build Coastguard Worker
120*c0909341SAndroid Build Coastguard Worker// unsigned dav1d_msac_decode_symbol_adapt4_neon(MsacContext *s, uint16_t *cdf,
121*c0909341SAndroid Build Coastguard Worker//                                               size_t n_symbols);
//
// Register use: x0 = s, x1 = cdf, x2 = n_symbols on entry; the decoded symbol
// index is returned in w0.
//
// The body is produced by the decode_update macro, which is also expanded by
// the 8- and 16-lane variants further down. Each expansion does, in order:
//   1. compute the per-lane thresholds v from cdf and s->rng,
//   2. compare the top 16 bits of s->dif against v to find the symbol (ret),
//   3. adapt the cdf in place when s->allow_update_cdf is non-zero,
//   4. renormalise rng/dif/cnt, branching to L(refill) when cnt underflows.
// L(refill) lives in this function and is shared by every expansion; it
// expects w15 = ret, w6 = cnt (already decremented) and x7 = dif.
122*c0909341SAndroid Build Coastguard Worker
123*c0909341SAndroid Build Coastguard Workerfunction msac_decode_symbol_adapt4_neon, export=1
// decode_update sz, szb, n
//   sz  - halfword arrangement used for the cdf vectors
//   szb - byte arrangement of the same width, used for bitwise operations
//   n   - number of 16-bit lanes handled (4, 8 or 16)
124*c0909341SAndroid Build Coastguard Worker.macro decode_update sz, szb, n
// The 16-lane variant reserves 48 bytes of stack so that u and v can be
// spilled and then indexed with scalar loads once ret is known.
125*c0909341SAndroid Build Coastguard Worker.if \n == 16
126*c0909341SAndroid Build Coastguard Worker        sub             sp,  sp,  #48
127*c0909341SAndroid Build Coastguard Worker.endif
128*c0909341SAndroid Build Coastguard Worker        add             x8,  x0,  #RNG
129*c0909341SAndroid Build Coastguard Worker        ld1_n           v0,  v1,  x1,  \sz, \n                    // cdf
130*c0909341SAndroid Build Coastguard Worker        ld1r            {v29\sz}, [x8]                            // rng
131*c0909341SAndroid Build Coastguard Worker        movrel          x9,  coeffs, COEFFS_BASE_OFFSET
132*c0909341SAndroid Build Coastguard Worker        movi            v31\sz, #0x7f, lsl #8                     // 0x7f00
        // x10 = coeffs + COEFFS_BASE_OFFSET - 2*n_symbols
133*c0909341SAndroid Build Coastguard Worker        sub             x10, x9,  x2, lsl #1
134*c0909341SAndroid Build Coastguard Worker        mvni            v30\sz, #0x3f                             // 0xffc0
135*c0909341SAndroid Build Coastguard Worker        and             v7\szb, v29\szb, v31\szb                  // rng & 0x7f00
136*c0909341SAndroid Build Coastguard Worker.if \n == 16
137*c0909341SAndroid Build Coastguard Worker        str             h29, [sp, #14]                            // store original u = s->rng
138*c0909341SAndroid Build Coastguard Worker.endif
139*c0909341SAndroid Build Coastguard Worker        and_n           v2,  v3,  v0,  v1,  v30, v30, \szb, \n    // cdf & 0xffc0
140*c0909341SAndroid Build Coastguard Worker
141*c0909341SAndroid Build Coastguard Worker        ld1_n           v4,  v5,  x10, \sz, \n                    // EC_MIN_PROB * (n_symbols - ret)
142*c0909341SAndroid Build Coastguard Worker        sqdmulh_n       v6,  v7,  v2,  v3,  v7,  v7,  \sz, \n     // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1
143*c0909341SAndroid Build Coastguard Worker        ldr             d28, [x0, #DIF]
144*c0909341SAndroid Build Coastguard Worker
145*c0909341SAndroid Build Coastguard Worker        add_n           v4,  v5,  v2,  v3,  v4,  v5,  \sz, \n     // v = cdf + EC_MIN_PROB * (n_symbols - ret)
146*c0909341SAndroid Build Coastguard Worker        add_n           v4,  v5,  v6,  v7,  v4,  v5,  \sz, \n     // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret)
147*c0909341SAndroid Build Coastguard Worker
148*c0909341SAndroid Build Coastguard Worker        dup             v30\sz, v28.h[3]                          // dif >> (EC_WIN_SIZE - 16)
149*c0909341SAndroid Build Coastguard Worker.if \n == 8
150*c0909341SAndroid Build Coastguard Worker        ldur            q31, [x9, #MASKS8_OFFSET]
151*c0909341SAndroid Build Coastguard Worker.elseif \n == 16
152*c0909341SAndroid Build Coastguard Worker        str_n           q4,  q5,  sp, #16, \n                     // store v values to allow indexed access
153*c0909341SAndroid Build Coastguard Worker.endif
154*c0909341SAndroid Build Coastguard Worker
155*c0909341SAndroid Build Coastguard Worker        // After the condition starts being true it continues, such that the vector looks like:
156*c0909341SAndroid Build Coastguard Worker        //   0, 0, 0 ... -1, -1
157*c0909341SAndroid Build Coastguard Worker        cmhs_n          v2,  v3,  v30, v30, v4,  v5,  \sz,  \n    // c >= v
        // Turn the compare mask into ret. Each lane count uses a different
        // encoding in w15: 16*ret (n == 4), 2*ret (n == 8), ret - 16 (n == 16).
158*c0909341SAndroid Build Coastguard Worker.if \n == 4
159*c0909341SAndroid Build Coastguard Worker        ext             v29\szb, v29\szb, v4\szb, #6              // u
160*c0909341SAndroid Build Coastguard Worker        umov            x15, v2.d[0]
161*c0909341SAndroid Build Coastguard Worker        ldr             w4,  [x0, #ALLOW_UPDATE_CDF]
162*c0909341SAndroid Build Coastguard Worker        rev             x15, x15
163*c0909341SAndroid Build Coastguard Worker        sub             v29\sz, v29\sz, v4\sz                     // rng = u-v
164*c0909341SAndroid Build Coastguard Worker        // rev + clz = count trailing zeros
165*c0909341SAndroid Build Coastguard Worker        clz             x15, x15                                  // 16*ret
166*c0909341SAndroid Build Coastguard Worker.elseif \n == 8
167*c0909341SAndroid Build Coastguard Worker        // The final short of the compare is always set.
168*c0909341SAndroid Build Coastguard Worker        // Using addv, subtract -0x202*ret from this value to create a lookup table for a short.
169*c0909341SAndroid Build Coastguard Worker        //  For n == 8:
170*c0909341SAndroid Build Coastguard Worker        // -0x202 + -0x202 + ... + 0xF0E
171*c0909341SAndroid Build Coastguard Worker        //                    (0x202*7) | (1 << 8)
172*c0909341SAndroid Build Coastguard Worker        //                                    ^-------offset for second byte of the short
173*c0909341SAndroid Build Coastguard Worker        and             v31\szb, v31\szb, v2\szb
174*c0909341SAndroid Build Coastguard Worker        ext             v29\szb, v29\szb, v4\szb, #14             // u
175*c0909341SAndroid Build Coastguard Worker        addv            h31, v31\sz                               // ((2*ret + 1) << 8) | (2*ret)
176*c0909341SAndroid Build Coastguard Worker        ldr             w4,  [x0, #ALLOW_UPDATE_CDF]
177*c0909341SAndroid Build Coastguard Worker        sub             v30\sz, v30\sz, v4\sz                     // (dif >> 48) - v
178*c0909341SAndroid Build Coastguard Worker        smov            w15, v31.b[0]                             // 2*ret
179*c0909341SAndroid Build Coastguard Worker        sub             v29\sz, v29\sz, v4\sz                     // rng = u-v
180*c0909341SAndroid Build Coastguard Worker.elseif \n == 16
181*c0909341SAndroid Build Coastguard Worker        add             v6\sz,  v2\sz,  v3\sz
182*c0909341SAndroid Build Coastguard Worker        addv            h31, v6\sz                                // -n + ret
183*c0909341SAndroid Build Coastguard Worker        ldr             w4,  [x0, #ALLOW_UPDATE_CDF]
184*c0909341SAndroid Build Coastguard Worker        smov            w15, v31.h[0]
185*c0909341SAndroid Build Coastguard Worker.endif
186*c0909341SAndroid Build Coastguard Worker
        // Skip the adaptation when s->allow_update_cdf is zero.
187*c0909341SAndroid Build Coastguard Worker        cbz             w4,  0f
188*c0909341SAndroid Build Coastguard Worker
189*c0909341SAndroid Build Coastguard Worker        // update_cdf
190*c0909341SAndroid Build Coastguard Worker        ldrh            w3,  [x1, x2, lsl #1]                     // count = cdf[n_symbols]
        // Build -rate in w4, where rate = (count >> 4) + (n_symbols > 2) + 4.
191*c0909341SAndroid Build Coastguard Worker.if \n == 16
192*c0909341SAndroid Build Coastguard Worker        // 16 case has a lower bound that guarantees n_symbols > 2
193*c0909341SAndroid Build Coastguard Worker        mov             w4,  #-5
194*c0909341SAndroid Build Coastguard Worker.elseif \n == 8
        // w14 = ~n_symbols; the cmn leaves carry set exactly when
        // n_symbols <= 2, and the sbc below subtracts an extra 1 otherwise.
195*c0909341SAndroid Build Coastguard Worker        mvn             w14, w2
196*c0909341SAndroid Build Coastguard Worker        mov             w4,  #-4
197*c0909341SAndroid Build Coastguard Worker        cmn             w14, #3                                   // set C if n_symbols <= 2
198*c0909341SAndroid Build Coastguard Worker.else
199*c0909341SAndroid Build Coastguard Worker        // if n_symbols < 4 (or < 6 even) then
200*c0909341SAndroid Build Coastguard Worker        //   (1 + n_symbols) >> 2 == n_symbols > 2
201*c0909341SAndroid Build Coastguard Worker        add             w14, w2,  #17                             // (1 + n_symbols) + (4 << 2)
202*c0909341SAndroid Build Coastguard Worker.endif
203*c0909341SAndroid Build Coastguard Worker        sub_n           v16, v17, v0,  v1,  v2,  v3,  \sz, \n     // cdf + (i >= val ? 1 : 0)
204*c0909341SAndroid Build Coastguard Worker        orr             v2\sz, #0x80, lsl #8
205*c0909341SAndroid Build Coastguard Worker.if \n == 16
206*c0909341SAndroid Build Coastguard Worker        orr             v3\sz, #0x80, lsl #8
207*c0909341SAndroid Build Coastguard Worker.endif
208*c0909341SAndroid Build Coastguard Worker.if \n == 16
209*c0909341SAndroid Build Coastguard Worker        sub             w4,  w4,  w3, lsr #4                      // -((count >> 4) + 5)
210*c0909341SAndroid Build Coastguard Worker.elseif \n == 8
211*c0909341SAndroid Build Coastguard Worker        lsr             w14, w3,  #4                              // count >> 4
212*c0909341SAndroid Build Coastguard Worker        sbc             w4,  w4,  w14                             // -((count >> 4) + (n_symbols > 2) + 4)
213*c0909341SAndroid Build Coastguard Worker.else
214*c0909341SAndroid Build Coastguard Worker        neg             w4, w14, lsr #2                           // -((n_symbols > 2) + 4)
215*c0909341SAndroid Build Coastguard Worker        sub             w4,  w4,  w3,  lsr #4                     // -((count >> 4) + (n_symbols > 2) + 4)
216*c0909341SAndroid Build Coastguard Worker.endif
217*c0909341SAndroid Build Coastguard Worker        sub_n           v2,  v3,  v2,  v3,  v0,  v1,  \sz, \n     // (32768 - cdf[i]) or (-1 - cdf[i])
218*c0909341SAndroid Build Coastguard Worker        dup             v6\sz,    w4                              // -rate
219*c0909341SAndroid Build Coastguard Worker
220*c0909341SAndroid Build Coastguard Worker        sub             w3,  w3,  w3, lsr #5                      // count - (count == 32)
221*c0909341SAndroid Build Coastguard Worker        sshl_n          v2,  v3,  v2,  v3,  v6,  v6,  \sz, \n     // ({32768,-1} - cdf[i]) >> rate
222*c0909341SAndroid Build Coastguard Worker        add             w3,  w3,  #1                              // count + (count < 32)
223*c0909341SAndroid Build Coastguard Worker        add_n           v0,  v1,  v16, v17, v2,  v3,  \sz, \n     // cdf + (32768 - cdf[i]) >> rate
        // The vector store also overwrites cdf[n_symbols] whenever n_symbols
        // is smaller than the lane count; the strh that follows writes the
        // updated count back into that slot.
224*c0909341SAndroid Build Coastguard Worker        st1_n           v0,  v1,  x1,  \sz, \n
225*c0909341SAndroid Build Coastguard Worker        strh            w3,  [x1, x2, lsl #1]
226*c0909341SAndroid Build Coastguard Worker
227*c0909341SAndroid Build Coastguard Worker0:
228*c0909341SAndroid Build Coastguard Worker        // renorm
        // In every variant the subs on cnt sets the flags consumed by b.lo:
        // a borrow (cnt < d, unsigned) means the window needs refilling.
229*c0909341SAndroid Build Coastguard Worker.if \n == 4
230*c0909341SAndroid Build Coastguard Worker        ldr             w6,  [x0, #CNT]
231*c0909341SAndroid Build Coastguard Worker        ldr             x7,  [x0, #DIF]
232*c0909341SAndroid Build Coastguard Worker        mov             x4,  v29.d[0]          // rng (packed)
233*c0909341SAndroid Build Coastguard Worker        mov             x3,  v4.d[0]           // v (packed)
234*c0909341SAndroid Build Coastguard Worker
235*c0909341SAndroid Build Coastguard Worker        // Shift 'v'/'rng' for ret into the 16 least sig bits. There is
236*c0909341SAndroid Build Coastguard Worker        //  garbage in the remaining bits, but we can work around this.
237*c0909341SAndroid Build Coastguard Worker        lsr             x4,  x4,  x15          // rng
238*c0909341SAndroid Build Coastguard Worker        lsr             x3,  x3,  x15          // v
239*c0909341SAndroid Build Coastguard Worker        lsl             w5,  w4,  #16          // rng << 16
240*c0909341SAndroid Build Coastguard Worker        sub             x7,  x7,  x3, lsl #48  // dif - (v << 48)
241*c0909341SAndroid Build Coastguard Worker        clz             w5,  w5                // d = clz(rng << 16)
242*c0909341SAndroid Build Coastguard Worker        lsl             w4,  w4,  w5           // rng << d
243*c0909341SAndroid Build Coastguard Worker        subs            w6,  w6,  w5           // cnt -= d
244*c0909341SAndroid Build Coastguard Worker        lsl             x7,  x7,  x5           // (dif - (v << 48)) << d
        // Halfword store: only the low 16 bits of w4 hold the new rng.
245*c0909341SAndroid Build Coastguard Worker        strh            w4,  [x0, #RNG]
246*c0909341SAndroid Build Coastguard Worker        b.lo            1f
247*c0909341SAndroid Build Coastguard Worker        str             w6,  [x0, #CNT]
248*c0909341SAndroid Build Coastguard Worker        str             x7,  [x0, #DIF]
249*c0909341SAndroid Build Coastguard Worker        lsr             w0,  w15, #4
250*c0909341SAndroid Build Coastguard Worker        ret
251*c0909341SAndroid Build Coastguard Worker1:
        // Refill path: convert 16*ret to ret for the shared tail.
252*c0909341SAndroid Build Coastguard Worker        lsr             w15, w15, #4
253*c0909341SAndroid Build Coastguard Worker        b L(refill)
254*c0909341SAndroid Build Coastguard Worker.elseif \n == 8
255*c0909341SAndroid Build Coastguard Worker        ldr             w6,  [x0, #CNT]
        // v31 holds the byte indices {2*ret, 2*ret + 1}, so each tbl moves
        // halfword lane ret of its source into lane 0.
256*c0909341SAndroid Build Coastguard Worker        tbl             v30.8b, {v30.16b}, v31.8b
257*c0909341SAndroid Build Coastguard Worker        tbl             v29.8b, {v29.16b}, v31.8b
258*c0909341SAndroid Build Coastguard Worker        ins             v28.h[3], v30.h[0]     // dif - (v << 48)
259*c0909341SAndroid Build Coastguard Worker        clz             v0.4h,  v29.4h         // d = clz(rng)
260*c0909341SAndroid Build Coastguard Worker        umov            w5,  v0.h[0]
261*c0909341SAndroid Build Coastguard Worker        ushl            v29.4h, v29.4h, v0.4h  // rng << d
262*c0909341SAndroid Build Coastguard Worker
263*c0909341SAndroid Build Coastguard Worker        // The vec for clz(rng) is filled with garbage after the first short,
264*c0909341SAndroid Build Coastguard Worker        //  but ushl/sshl conveniently uses only the first byte for the shift
265*c0909341SAndroid Build Coastguard Worker        //  amount.
266*c0909341SAndroid Build Coastguard Worker        ushl            d28, d28, d0           // (dif - (v << 48)) << d
267*c0909341SAndroid Build Coastguard Worker
268*c0909341SAndroid Build Coastguard Worker        subs            w6,  w6,  w5           // cnt -= d
269*c0909341SAndroid Build Coastguard Worker        str             h29, [x0, #RNG]
270*c0909341SAndroid Build Coastguard Worker        b.lo            1f
271*c0909341SAndroid Build Coastguard Worker        str             w6,  [x0, #CNT]
272*c0909341SAndroid Build Coastguard Worker        str             d28, [x0, #DIF]
273*c0909341SAndroid Build Coastguard Worker        lsr             w0,  w15, #1           // ret
274*c0909341SAndroid Build Coastguard Worker        ret
275*c0909341SAndroid Build Coastguard Worker1:
276*c0909341SAndroid Build Coastguard Worker        lsr             w15, w15, #1           // ret
        // The shared refill code expects dif in x7, not in a vector register.
277*c0909341SAndroid Build Coastguard Worker        mov             x7, v28.d[0]
278*c0909341SAndroid Build Coastguard Worker        b L(refill)
279*c0909341SAndroid Build Coastguard Worker.elseif \n == 16
        // w15 = ret - 16, so x8 + 48 addresses v[ret] in the spill area
        // (v starts at sp + 16) and x8 + 46 addresses the halfword before
        // it: v[ret - 1], or the saved rng at sp + 14 when ret == 0.
280*c0909341SAndroid Build Coastguard Worker        add             x8,  sp,  w15, sxtw #1
281*c0909341SAndroid Build Coastguard Worker        ldrh            w3,  [x8, #48]         // v
282*c0909341SAndroid Build Coastguard Worker        ldurh           w4,  [x8, #46]         // u
283*c0909341SAndroid Build Coastguard Worker        ldr             w6,  [x0, #CNT]
284*c0909341SAndroid Build Coastguard Worker        ldr             x7,  [x0, #DIF]
285*c0909341SAndroid Build Coastguard Worker        sub             w4,  w4,  w3           // rng = u - v
286*c0909341SAndroid Build Coastguard Worker        clz             w5,  w4                // clz(rng)
287*c0909341SAndroid Build Coastguard Worker        eor             w5,  w5,  #16          // d = clz(rng) ^ 16
288*c0909341SAndroid Build Coastguard Worker        sub             x7,  x7,  x3, lsl #48  // dif - (v << 48)
289*c0909341SAndroid Build Coastguard Worker        lsl             w4,  w4,  w5           // rng << d
290*c0909341SAndroid Build Coastguard Worker        subs            w6,  w6,  w5           // cnt -= d
291*c0909341SAndroid Build Coastguard Worker        lsl             x7,  x7,  x5           // (dif - (v << 48)) << d
292*c0909341SAndroid Build Coastguard Worker        str             w4,  [x0, #RNG]
        // Release the scratch area; this add does not modify the flags that
        // the b.lo below consumes.
293*c0909341SAndroid Build Coastguard Worker        add             sp,  sp,  #48
294*c0909341SAndroid Build Coastguard Worker        b.lo            1f
295*c0909341SAndroid Build Coastguard Worker        str             w6,  [x0, #CNT]
296*c0909341SAndroid Build Coastguard Worker        str             x7,  [x0, #DIF]
297*c0909341SAndroid Build Coastguard Worker        add             w0,  w15, #\n          // ret
298*c0909341SAndroid Build Coastguard Worker        ret
299*c0909341SAndroid Build Coastguard Worker1:
300*c0909341SAndroid Build Coastguard Worker        add             w15, w15, #\n          // ret
301*c0909341SAndroid Build Coastguard Worker        b L(refill)
302*c0909341SAndroid Build Coastguard Worker.endif
303*c0909341SAndroid Build Coastguard Worker.endm
304*c0909341SAndroid Build Coastguard Worker
305*c0909341SAndroid Build Coastguard Worker        decode_update   .4h, .8b, 4
306*c0909341SAndroid Build Coastguard Worker
// Shared refill tail. Entered with x0 = s, w6 = cnt after the renorm
// subtraction, x7 = shifted dif and w15 = ret. Stores cnt and dif back into
// the context, advances buf_pos, and returns ret in w0.
307*c0909341SAndroid Build Coastguard WorkerL(refill):
308*c0909341SAndroid Build Coastguard Worker        // refill
309*c0909341SAndroid Build Coastguard Worker        ldp             x3,  x4,  [x0]         // BUF_POS, BUF_END
        // x5 = buf_pos + 8 - buf_end; positive means that fewer than 8
        // bytes remain, so the full 8-byte load below is not taken.
310*c0909341SAndroid Build Coastguard Worker        add             x5,  x3,  #8
311*c0909341SAndroid Build Coastguard Worker        subs            x5,  x5,  x4
312*c0909341SAndroid Build Coastguard Worker        b.hi            6f
313*c0909341SAndroid Build Coastguard Worker
314*c0909341SAndroid Build Coastguard Worker        ldr             x8,  [x3]              // next_bits
315*c0909341SAndroid Build Coastguard Worker        add             w4,  w6,  #-48         // shift_bits = cnt + 16 (- 64)
        // The loaded bits are inverted before being merged into dif.
316*c0909341SAndroid Build Coastguard Worker        mvn             x8,  x8
317*c0909341SAndroid Build Coastguard Worker        neg             w5,  w4
318*c0909341SAndroid Build Coastguard Worker        rev             x8,  x8                // next_bits = bswap(next_bits)
319*c0909341SAndroid Build Coastguard Worker        lsr             w5,  w5,  #3           // num_bytes_read
320*c0909341SAndroid Build Coastguard Worker        lsr             x8,  x8,  x4           // next_bits >>= (shift_bits & 63)
321*c0909341SAndroid Build Coastguard Worker
322*c0909341SAndroid Build Coastguard Worker2:      // refill_end
323*c0909341SAndroid Build Coastguard Worker        add             x3,  x3,  x5
324*c0909341SAndroid Build Coastguard Worker        add             w6,  w6,  w5, lsl #3   // cnt += num_bits_read
325*c0909341SAndroid Build Coastguard Worker        str             x3,  [x0, #BUF_POS]
326*c0909341SAndroid Build Coastguard Worker
327*c0909341SAndroid Build Coastguard Worker3:      // refill_end2
328*c0909341SAndroid Build Coastguard Worker        orr             x7,  x7,  x8           // dif |= next_bits
329*c0909341SAndroid Build Coastguard Worker
330*c0909341SAndroid Build Coastguard Worker4:      // end
331*c0909341SAndroid Build Coastguard Worker        str             w6,  [x0, #CNT]
332*c0909341SAndroid Build Coastguard Worker        str             x7,  [x0, #DIF]
333*c0909341SAndroid Build Coastguard Worker
334*c0909341SAndroid Build Coastguard Worker        mov             w0,  w15
335*c0909341SAndroid Build Coastguard Worker        ret
336*c0909341SAndroid Build Coastguard Worker
        // Reached from 6: when buf_pos >= buf_end. No bytes are consumed and
        // neither buf_pos nor cnt is changed; the value built in x8 is ORed
        // into dif at 3:.
337*c0909341SAndroid Build Coastguard Worker5:      // pad_with_ones
338*c0909341SAndroid Build Coastguard Worker        add             w8,  w6,  #-16
339*c0909341SAndroid Build Coastguard Worker        ror             x8,  x8,  x8
340*c0909341SAndroid Build Coastguard Worker        b               3b
341*c0909341SAndroid Build Coastguard Worker
        // Fewer than 8 bytes remain. x5 still holds buf_pos + 8 - buf_end.
342*c0909341SAndroid Build Coastguard Worker6:      // refill_eob
343*c0909341SAndroid Build Coastguard Worker        cmp             x3,  x4
344*c0909341SAndroid Build Coastguard Worker        b.hs            5b
345*c0909341SAndroid Build Coastguard Worker
        // Load the 8 bytes that end at buf_end, then shift out the x5
        // low-order bytes that lie before buf_pos.
346*c0909341SAndroid Build Coastguard Worker        ldr             x8,  [x4, #-8]
347*c0909341SAndroid Build Coastguard Worker        lsl             w5,  w5,  #3
348*c0909341SAndroid Build Coastguard Worker        lsr             x8,  x8,  x5
349*c0909341SAndroid Build Coastguard Worker        add             w5,  w6,  #-48
350*c0909341SAndroid Build Coastguard Worker        mvn             x8,  x8
351*c0909341SAndroid Build Coastguard Worker        sub             w4,  w4,  w3           // num_bytes_left
352*c0909341SAndroid Build Coastguard Worker        rev             x8,  x8
353*c0909341SAndroid Build Coastguard Worker        lsr             x8,  x8,  x5
354*c0909341SAndroid Build Coastguard Worker        neg             w5,  w5
355*c0909341SAndroid Build Coastguard Worker        lsr             w5,  w5,  #3
        // num_bytes_read = min((48 - cnt) >> 3, num_bytes_left), unsigned
356*c0909341SAndroid Build Coastguard Worker        cmp             w5,  w4
357*c0909341SAndroid Build Coastguard Worker        csel            w5,  w5,  w4,  lo      // num_bytes_read
358*c0909341SAndroid Build Coastguard Worker        b               2b
359*c0909341SAndroid Build Coastguard Workerendfunc
360*c0909341SAndroid Build Coastguard Worker
// 8-lane symbol decoder: expands decode_update with 8 halfword lanes.
// Uses the same registers as that macro: x0 = s, x1 = cdf, x2 = n_symbols,
// with the symbol index returned in w0. When cnt underflows it branches to
// L(refill), which is defined inside msac_decode_symbol_adapt4_neon.
361*c0909341SAndroid Build Coastguard Workerfunction msac_decode_symbol_adapt8_neon, export=1
362*c0909341SAndroid Build Coastguard Worker        decode_update   .8h, .16b, 8
363*c0909341SAndroid Build Coastguard Workerendfunc
364*c0909341SAndroid Build Coastguard Worker
// 16-lane symbol decoder: expands decode_update with two 8-halfword vectors.
// Uses the same registers as that macro: x0 = s, x1 = cdf, x2 = n_symbols,
// with the symbol index returned in w0. This variant temporarily reserves
// 48 bytes of stack and releases them before returning or branching to
// L(refill), which is defined inside msac_decode_symbol_adapt4_neon.
365*c0909341SAndroid Build Coastguard Workerfunction msac_decode_symbol_adapt16_neon, export=1
366*c0909341SAndroid Build Coastguard Worker        decode_update   .8h, .16b, 16
367*c0909341SAndroid Build Coastguard Workerendfunc
368*c0909341SAndroid Build Coastguard Worker
// msac_decode_hi_tok_neon (exported)
//
// In:   x0 = decoder context; fields are accessed through the offsets
//            BUF_POS/BUF_END (first two 64-bit words), DIF, RNG, CNT and
//            ALLOW_UPDATE_CDF, all defined outside this chunk.
//       x1 = cdf: four 16-bit entries, the one at byte offset 6 being the
//            adaptation count (cdf[n_symbols], i.e. n_symbols == 3).
// Out:  w0 = 3 + sum of the symbols decoded by the loop below.
//
// Each pass of loop 1: decodes one symbol 'ret' from the same cdf, optionally
// adapts the cdf, renormalizes rng/dif and refills dif when cnt runs out.
// w13 starts at -24*8 and grows by 16*ret per pass; the carry out of the
// 'adds' at 5: ends the loop when ret < 3 or when the running total reaches
// 15 (after four passes with ret == 3). The result is (w13 + 30*8) >> 4.
//
// Register roles inside the loop:
//   v0  = cdf            v17 = cdf & 0xffc0     v1  = top 16 bits of dif
//   v3  = rng (splat)    v4  = v per symbol     v18 = u - v per symbol
//   w6  = cnt            x7  = dif              w9  = count
//   w10 = allow_update_cdf                      x15 = 16*ret
// Only x0-x17 and v0-v7/v16-v31 are written; no stack is used.
369*c0909341SAndroid Build Coastguard Workerfunction msac_decode_hi_tok_neon, export=1
370*c0909341SAndroid Build Coastguard Worker        ld1             {v0.4h},  [x1]            // cdf
371*c0909341SAndroid Build Coastguard Worker        add             x16, x0,  #RNG            // address of the rng field
372*c0909341SAndroid Build Coastguard Worker        movi            v31.4h, #0x7f, lsl #8     // 0x7f00
373*c0909341SAndroid Build Coastguard Worker        movrel          x17, coeffs, COEFFS_BASE_OFFSET-2*3  // address inside the coeffs table (movrel/coeffs defined outside this chunk)
374*c0909341SAndroid Build Coastguard Worker        mvni            v30.4h, #0x3f             // 0xffc0
375*c0909341SAndroid Build Coastguard Worker        ldrh            w9,  [x1, #6]             // count = cdf[n_symbols]
376*c0909341SAndroid Build Coastguard Worker        ld1r            {v3.4h},  [x16]           // rng
377*c0909341SAndroid Build Coastguard Worker        ld1             {v29.4h}, [x17]           // EC_MIN_PROB * (n_symbols - ret)
378*c0909341SAndroid Build Coastguard Worker        add             x17, x0,  #DIF + 6        // address of bytes 6-7 of dif (its top 16 bits, assuming little-endian data)
379*c0909341SAndroid Build Coastguard Worker        mov             w13, #-24*8               // biased token accumulator, see the carry test at 5:
380*c0909341SAndroid Build Coastguard Worker        and             v17.8b,  v0.8b,   v30.8b  // cdf & 0xffc0
381*c0909341SAndroid Build Coastguard Worker        ldr             w10, [x0, #ALLOW_UPDATE_CDF]  // non-zero enables the update_cdf step below
382*c0909341SAndroid Build Coastguard Worker        ld1r            {v1.8h},  [x17]           // dif >> (EC_WIN_SIZE - 16)
383*c0909341SAndroid Build Coastguard Worker        ldr             w6,  [x0, #CNT]           // cnt
384*c0909341SAndroid Build Coastguard Worker        ldr             x7,  [x0, #DIF]           // dif
385*c0909341SAndroid Build Coastguard Worker1:
386*c0909341SAndroid Build Coastguard Worker        and             v7.8b,   v3.8b,   v31.8b  // rng & 0x7f00
387*c0909341SAndroid Build Coastguard Worker        sqdmulh         v6.4h,   v17.4h,  v7.4h   // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1
388*c0909341SAndroid Build Coastguard Worker        add             v4.4h,   v17.4h,  v29.4h  // v = cdf + EC_MIN_PROB * (n_symbols - ret)
389*c0909341SAndroid Build Coastguard Worker        add             v4.4h,   v6.4h,   v4.4h   // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret)
390*c0909341SAndroid Build Coastguard Worker        cmhs            v2.4h,   v1.4h,   v4.4h   // c >= v
391*c0909341SAndroid Build Coastguard Worker        add             w13, w13, #5*8            // cancelled by the 'sub w15, w15, #5*8' at 5:, net change per pass is 16*ret
392*c0909341SAndroid Build Coastguard Worker        ext             v18.8b, v3.8b,  v4.8b, #6 // u
393*c0909341SAndroid Build Coastguard Worker        umov            x15, v2.d[0]              // the four 16-bit compare masks packed into one register
394*c0909341SAndroid Build Coastguard Worker        rev             x15, x15                  // byte-reverse so that lane 0 ends up in the top bits
395*c0909341SAndroid Build Coastguard Worker        sub             v18.4h, v18.4h, v4.4h     // rng = u-v
396*c0909341SAndroid Build Coastguard Worker        // rev + clz = count trailing zeros
397*c0909341SAndroid Build Coastguard Worker        clz             x15, x15                  // 16*ret
398*c0909341SAndroid Build Coastguard Worker
399*c0909341SAndroid Build Coastguard Worker        cbz             w10, 2f
400*c0909341SAndroid Build Coastguard Worker        // update_cdf
401*c0909341SAndroid Build Coastguard Worker        sub             v5.4h,   v0.4h,   v2.4h   // cdf[i] + (i >= val ? 1 : 0)
402*c0909341SAndroid Build Coastguard Worker        mov             w4,  #-5
403*c0909341SAndroid Build Coastguard Worker        orr             v2.4h, #0x80, lsl #8      // i >= val ? -1 : 32768
404*c0909341SAndroid Build Coastguard Worker        sub             w4,  w4,  w9, lsr #4      // -((count >> 4) + 5)
405*c0909341SAndroid Build Coastguard Worker        sub             v2.4h,   v2.4h,   v0.4h   // (32768 - cdf[i]) or (-1 - cdf[i])
406*c0909341SAndroid Build Coastguard Worker        dup             v6.4h,    w4              // -rate
407*c0909341SAndroid Build Coastguard Worker
408*c0909341SAndroid Build Coastguard Worker        sub             w9,  w9,  w9, lsr #5      // count - (count == 32)
409*c0909341SAndroid Build Coastguard Worker        sshl            v2.4h,   v2.4h,   v6.4h   // ({32768,-1} - cdf[i]) >> rate
410*c0909341SAndroid Build Coastguard Worker        add             w9,  w9,  #1              // count + (count < 32)
411*c0909341SAndroid Build Coastguard Worker        add             v0.4h,   v5.4h,   v2.4h   // cdf[i] + (32768 - cdf[i]) >> rate
412*c0909341SAndroid Build Coastguard Worker        st1             {v0.4h},  [x1]            // write back all four lanes, including the count slot
413*c0909341SAndroid Build Coastguard Worker        and             v17.8b,  v0.8b,   v30.8b  // cdf & 0xffc0
414*c0909341SAndroid Build Coastguard Worker        strh            w9,  [x1, #6]             // store the new count over lane 3 written by the st1 above
415*c0909341SAndroid Build Coastguard Worker
416*c0909341SAndroid Build Coastguard Worker2:
417*c0909341SAndroid Build Coastguard Worker        mov             x4,  v18.d[0]          // rng (packed)
418*c0909341SAndroid Build Coastguard Worker        mov             x3,  v4.d[0]           // v (packed)
419*c0909341SAndroid Build Coastguard Worker
420*c0909341SAndroid Build Coastguard Worker        // Shift 'v'/'rng' for ret into the 16 least sig bits. There is
421*c0909341SAndroid Build Coastguard Worker        //  garbage in the remaining bits, but we can work around this.
422*c0909341SAndroid Build Coastguard Worker        lsr             x4,  x4,  x15          // rng
423*c0909341SAndroid Build Coastguard Worker        lsr             x3,  x3,  x15          // v
424*c0909341SAndroid Build Coastguard Worker        lsl             w5,  w4,  #16          // rng << 16
425*c0909341SAndroid Build Coastguard Worker        sub             x7,  x7,  x3, lsl #48  // dif - (v << 48)
426*c0909341SAndroid Build Coastguard Worker        clz             w5,  w5                // d = clz(rng << 16)
427*c0909341SAndroid Build Coastguard Worker        lsl             w4,  w4,  w5           // rng << d
428*c0909341SAndroid Build Coastguard Worker        subs            w6,  w6,  w5           // cnt -= d
429*c0909341SAndroid Build Coastguard Worker        lsl             x7,  x7,  x5           // (dif - (v << 48)) << d
430*c0909341SAndroid Build Coastguard Worker        strh            w4,  [x0, #RNG]        // 16-bit store drops the garbage above bit 15
431*c0909341SAndroid Build Coastguard Worker        dup             v3.4h,   w4            // rng for the next pass
432*c0909341SAndroid Build Coastguard Worker        b.hs            5f                     // no borrow from 'cnt -= d' (unsigned cnt >= d): skip the refill
433*c0909341SAndroid Build Coastguard Worker
434*c0909341SAndroid Build Coastguard Worker        // refill
435*c0909341SAndroid Build Coastguard Worker        ldp             x3,  x4,  [x0]         // BUF_POS, BUF_END
436*c0909341SAndroid Build Coastguard Worker        add             x5,  x3,  #8
437*c0909341SAndroid Build Coastguard Worker        subs            x5,  x5,  x4           // buf_pos + 8 - buf_end
438*c0909341SAndroid Build Coastguard Worker        b.hi            7f                     // fewer than 8 bytes remain: take the end-of-buffer path
439*c0909341SAndroid Build Coastguard Worker
440*c0909341SAndroid Build Coastguard Worker        ldr             x8,  [x3]              // next_bits
441*c0909341SAndroid Build Coastguard Worker        add             w4,  w6,  #-48         // shift_bits = cnt + 16 (- 64)
442*c0909341SAndroid Build Coastguard Worker        mvn             x8,  x8                // the loaded bits are inverted before being merged into dif
443*c0909341SAndroid Build Coastguard Worker        neg             w5,  w4                // 48 - cnt
444*c0909341SAndroid Build Coastguard Worker        rev             x8,  x8                // next_bits = bswap(next_bits)
445*c0909341SAndroid Build Coastguard Worker        lsr             w5,  w5,  #3           // num_bytes_read
446*c0909341SAndroid Build Coastguard Worker        lsr             x8,  x8,  x4           // next_bits >>= (shift_bits & 63)
447*c0909341SAndroid Build Coastguard Worker
448*c0909341SAndroid Build Coastguard Worker3:      // refill_end
449*c0909341SAndroid Build Coastguard Worker        add             x3,  x3,  x5           // buf_pos += num_bytes_read
450*c0909341SAndroid Build Coastguard Worker        add             w6,  w6,  w5, lsl #3   // cnt += num_bits_read
451*c0909341SAndroid Build Coastguard Worker        str             x3,  [x0, #BUF_POS]
452*c0909341SAndroid Build Coastguard Worker
453*c0909341SAndroid Build Coastguard Worker4:      // refill_end2
454*c0909341SAndroid Build Coastguard Worker        orr             x7,  x7,  x8           // dif |= next_bits
455*c0909341SAndroid Build Coastguard Worker
456*c0909341SAndroid Build Coastguard Worker5:      // end
457*c0909341SAndroid Build Coastguard Worker        sub             w15, w15, #5*8         // 16*ret - 5*8
458*c0909341SAndroid Build Coastguard Worker        lsr             x12, x7,  #48          // top 16 bits of the new dif
459*c0909341SAndroid Build Coastguard Worker        adds            w13, w13, w15          // carry = tok_br < 3 || tok == 15
460*c0909341SAndroid Build Coastguard Worker        dup             v1.8h,   w12           // c for the next pass
461*c0909341SAndroid Build Coastguard Worker        b.cc            1b                     // loop if !carry
462*c0909341SAndroid Build Coastguard Worker        add             w13, w13, #30*8        // remove the bias: w13 = 16 * (3 + sum of ret)
463*c0909341SAndroid Build Coastguard Worker        str             w6,  [x0, #CNT]
464*c0909341SAndroid Build Coastguard Worker        str             x7,  [x0, #DIF]
465*c0909341SAndroid Build Coastguard Worker        lsr             w0,  w13, #4           // return value
466*c0909341SAndroid Build Coastguard Worker        ret
467*c0909341SAndroid Build Coastguard Worker
468*c0909341SAndroid Build Coastguard Worker6:      // pad_with_ones
469*c0909341SAndroid Build Coastguard Worker        add             w8,  w6,  #-16         // x8 = zero-extended (cnt - 16)
470*c0909341SAndroid Build Coastguard Worker        ror             x8,  x8,  x8           // rotate that value right by its own low 6 bits to form the padding pattern
471*c0909341SAndroid Build Coastguard Worker        b               4b                     // merge into dif; BUF_POS and cnt are left unchanged on this path
472*c0909341SAndroid Build Coastguard Worker
473*c0909341SAndroid Build Coastguard Worker7:      // refill_eob
474*c0909341SAndroid Build Coastguard Worker        cmp             x3,  x4
475*c0909341SAndroid Build Coastguard Worker        b.hs            6b                     // buf_pos >= buf_end: nothing left to read
476*c0909341SAndroid Build Coastguard Worker
477*c0909341SAndroid Build Coastguard Worker        ldr             x8,  [x4, #-8]         // the 8 bytes that end at buf_end
478*c0909341SAndroid Build Coastguard Worker        lsl             w5,  w5,  #3           // 8 * (buf_pos + 8 - buf_end)
479*c0909341SAndroid Build Coastguard Worker        lsr             x8,  x8,  x5           // discard the bytes in front of buf_pos (assumes a little-endian load)
480*c0909341SAndroid Build Coastguard Worker        add             w5,  w6,  #-48         // shift_bits = cnt + 16 (- 64), as in the main refill path
481*c0909341SAndroid Build Coastguard Worker        mvn             x8,  x8
482*c0909341SAndroid Build Coastguard Worker        sub             w4,  w4,  w3           // num_bytes_left
483*c0909341SAndroid Build Coastguard Worker        rev             x8,  x8
484*c0909341SAndroid Build Coastguard Worker        lsr             x8,  x8,  x5
485*c0909341SAndroid Build Coastguard Worker        neg             w5,  w5
486*c0909341SAndroid Build Coastguard Worker        lsr             w5,  w5,  #3           // bytes wanted = (48 - cnt) >> 3
487*c0909341SAndroid Build Coastguard Worker        cmp             w5,  w4
488*c0909341SAndroid Build Coastguard Worker        csel            w5,  w5,  w4,  lo      // num_bytes_read
489*c0909341SAndroid Build Coastguard Worker        b               3b
490*c0909341SAndroid Build Coastguard Workerendfunc
491*c0909341SAndroid Build Coastguard Worker
// msac_decode_bool_equi_neon (exported)
//
// In:   x0 = decoder context (fields at the RNG, CNT and DIF offsets, which
//            are defined outside this chunk; CNT is loaded as the word
//            directly after RNG).
// Out:  w0 = decoded bit, 1 when dif < vw and 0 otherwise.
//
// Decodes one bit using the fixed split v = ((rng & 0xff00) + 8) >> 1, then
// renormalizes rng/dif by d = clz(rng) ^ 16. When 'cnt -= d' borrows, control
// leaves through L(refill), which is defined outside this chunk.
// NOTE(review): L(refill) presumably expects w4 = rng, w6 = cnt, x7 = dif and
// w15 = return value, and returns to the caller itself -- confirm there.
// Note that 'ret' in the comments below is the inverse of the value returned
// in w15/w0.
492*c0909341SAndroid Build Coastguard Workerfunction msac_decode_bool_equi_neon, export=1
493*c0909341SAndroid Build Coastguard Worker        ldp             w5,  w6,  [x0, #RNG]   // + CNT
494*c0909341SAndroid Build Coastguard Worker        ldr             x7,  [x0, #DIF]
495*c0909341SAndroid Build Coastguard Worker        bic             w4,  w5,  #0xff        // r &= 0xff00
496*c0909341SAndroid Build Coastguard Worker        add             w4,  w4,  #8           // 2*v, halved two instructions below
497*c0909341SAndroid Build Coastguard Worker        subs            x8,  x7,  x4, lsl #47  // dif - vw
498*c0909341SAndroid Build Coastguard Worker        lsr             w4,  w4,  #1           // v
499*c0909341SAndroid Build Coastguard Worker        sub             w5,  w5,  w4           // r - v
500*c0909341SAndroid Build Coastguard Worker        cset            w15, lo                // return value = (dif < vw)
501*c0909341SAndroid Build Coastguard Worker        csel            w4,  w5,  w4,  hs      // if (ret) v = r - v;
502*c0909341SAndroid Build Coastguard Worker        csel            x7,  x8,  x7,  hs      // if (ret) dif = dif - vw;
503*c0909341SAndroid Build Coastguard Worker
504*c0909341SAndroid Build Coastguard Worker        clz             w5,  w4                // clz(rng)
505*c0909341SAndroid Build Coastguard Worker        eor             w5,  w5,  #16          // d = clz(rng) ^ 16
506*c0909341SAndroid Build Coastguard Worker        lsl             w4,  w4,  w5           // rng << d
507*c0909341SAndroid Build Coastguard Worker        subs            w6,  w6,  w5           // cnt -= d
508*c0909341SAndroid Build Coastguard Worker        lsl             x7,  x7,  x5           // (dif - (v << 48)) << d
509*c0909341SAndroid Build Coastguard Worker        str             w4,  [x0, #RNG]
510*c0909341SAndroid Build Coastguard Worker        b.lo            L(refill)              // borrow (unsigned cnt < d): refill before returning
511*c0909341SAndroid Build Coastguard Worker
512*c0909341SAndroid Build Coastguard Worker        str             w6,  [x0, #CNT]
513*c0909341SAndroid Build Coastguard Worker        str             x7,  [x0, #DIF]
514*c0909341SAndroid Build Coastguard Worker        mov             w0,  w15               // return the decoded bit
515*c0909341SAndroid Build Coastguard Worker        ret
516*c0909341SAndroid Build Coastguard Workerendfunc
517*c0909341SAndroid Build Coastguard Worker
// msac_decode_bool_neon (exported)
//
// In:   x0 = decoder context (fields at the RNG, CNT and DIF offsets, which
//            are defined outside this chunk; CNT is loaded as the word
//            directly after RNG).
//       w1 = f, the probability value; its low 6 bits are cleared before use.
// Out:  w0 = decoded bit, 1 when dif < vw and 0 otherwise.
//
// Computes v = (((rng >> 8) * (f & ~63)) >> 7) + 4 and vw = v << 48, selects
// the new rng/dif from the comparison, then renormalizes by
// d = clz(rng) ^ 16. When 'cnt -= d' borrows, control leaves through
// L(refill), which is defined outside this chunk.
// NOTE(review): L(refill) presumably expects w4 = rng, w6 = cnt, x7 = dif and
// w15 = return value, and returns to the caller itself -- confirm there.
// Note that 'ret' in the comments below is the inverse of the value returned
// in w15/w0.
518*c0909341SAndroid Build Coastguard Workerfunction msac_decode_bool_neon, export=1
519*c0909341SAndroid Build Coastguard Worker        ldp             w5,  w6,  [x0, #RNG]   // + CNT
520*c0909341SAndroid Build Coastguard Worker        ldr             x7,  [x0, #DIF]
521*c0909341SAndroid Build Coastguard Worker        lsr             w4,  w5,  #8           // r >> 8
522*c0909341SAndroid Build Coastguard Worker        bic             w1,  w1,  #0x3f        // f &= ~63
523*c0909341SAndroid Build Coastguard Worker        mul             w4,  w4,  w1           // (r >> 8) * f
524*c0909341SAndroid Build Coastguard Worker        lsr             w4,  w4,  #7           // ((r >> 8) * f) >> 7
525*c0909341SAndroid Build Coastguard Worker        add             w4,  w4,  #4           // v
526*c0909341SAndroid Build Coastguard Worker        subs            x8,  x7,  x4, lsl #48  // dif - vw
527*c0909341SAndroid Build Coastguard Worker        sub             w5,  w5,  w4           // r - v
528*c0909341SAndroid Build Coastguard Worker        cset            w15, lo                // return value = (dif < vw)
529*c0909341SAndroid Build Coastguard Worker        csel            w4,  w5,  w4,  hs      // if (ret) v = r - v;
530*c0909341SAndroid Build Coastguard Worker        csel            x7,  x8,  x7,  hs      // if (ret) dif = dif - vw;
531*c0909341SAndroid Build Coastguard Worker
532*c0909341SAndroid Build Coastguard Worker        clz             w5,  w4                // clz(rng)
533*c0909341SAndroid Build Coastguard Worker        eor             w5,  w5,  #16          // d = clz(rng) ^ 16
534*c0909341SAndroid Build Coastguard Worker        lsl             w4,  w4,  w5           // rng << d
535*c0909341SAndroid Build Coastguard Worker        subs            w6,  w6,  w5           // cnt -= d
536*c0909341SAndroid Build Coastguard Worker        lsl             x7,  x7,  x5           // (dif - (v << 48)) << d
537*c0909341SAndroid Build Coastguard Worker        str             w4,  [x0, #RNG]
538*c0909341SAndroid Build Coastguard Worker        b.lo            L(refill)              // borrow (unsigned cnt < d): refill before returning
539*c0909341SAndroid Build Coastguard Worker
540*c0909341SAndroid Build Coastguard Worker        str             w6,  [x0, #CNT]
541*c0909341SAndroid Build Coastguard Worker        str             x7,  [x0, #DIF]
542*c0909341SAndroid Build Coastguard Worker        mov             w0,  w15               // return the decoded bit
543*c0909341SAndroid Build Coastguard Worker        ret
544*c0909341SAndroid Build Coastguard Workerendfunc
545*c0909341SAndroid Build Coastguard Worker
// msac_decode_bool_adapt_neon (exported)
//
// In:   x0 = decoder context (fields at the RNG, CNT, DIF and
//            ALLOW_UPDATE_CDF offsets, which are defined outside this chunk;
//            CNT is loaded as the word directly after RNG).
//       x1 = cdf: two 16-bit entries, cdf[0] (the probability) followed by
//            cdf[1] (the adaptation count). Both are fetched with one 32-bit
//            load and split with a shift/mask, so cdf[0] is the low half.
// Out:  w0 = decoded bit, 1 when dif < vw and 0 otherwise.
//
// Decodes one bit with f = cdf[0] & 0xffc0 using the same arithmetic as
// msac_decode_bool_neon. If the ALLOW_UPDATE_CDF field is non-zero it then
// adapts cdf[0] towards the decoded bit with rate (count >> 4) + 4 and
// increments the count unless it is already 32. Finally rng/dif are
// renormalized by d = clz(rng) ^ 16; when 'cnt -= d' borrows, control leaves
// through L(refill), which is defined outside this chunk.
// NOTE(review): L(refill) presumably expects w4 = rng, w6 = cnt, x7 = dif and
// w15 = return value, and returns to the caller itself -- confirm there.
// Note that 'ret' in the comments below is the inverse of the value returned
// in w15/w0, while 'bit' is w15 itself.
546*c0909341SAndroid Build Coastguard Workerfunction msac_decode_bool_adapt_neon, export=1
547*c0909341SAndroid Build Coastguard Worker        ldr             w9,  [x1]              // cdf[0-1]
548*c0909341SAndroid Build Coastguard Worker        ldp             w5,  w6,  [x0, #RNG]   // + CNT
549*c0909341SAndroid Build Coastguard Worker        ldr             x7,  [x0, #DIF]
550*c0909341SAndroid Build Coastguard Worker        lsr             w4,  w5,  #8           // r >> 8
551*c0909341SAndroid Build Coastguard Worker        and             w2,  w9,  #0xffc0      // f &= ~63
552*c0909341SAndroid Build Coastguard Worker        mul             w4,  w4,  w2           // (r >> 8) * f
553*c0909341SAndroid Build Coastguard Worker        lsr             w4,  w4,  #7           // ((r >> 8) * f) >> 7
554*c0909341SAndroid Build Coastguard Worker        add             w4,  w4,  #4           // v
555*c0909341SAndroid Build Coastguard Worker        subs            x8,  x7,  x4, lsl #48  // dif - vw
556*c0909341SAndroid Build Coastguard Worker        sub             w5,  w5,  w4           // r - v
557*c0909341SAndroid Build Coastguard Worker        cset            w15, lo                // bit = return value = (dif < vw)
558*c0909341SAndroid Build Coastguard Worker        csel            w4,  w5,  w4,  hs      // if (ret) v = r - v;
559*c0909341SAndroid Build Coastguard Worker        csel            x7,  x8,  x7,  hs      // if (ret) dif = dif - vw;
560*c0909341SAndroid Build Coastguard Worker
561*c0909341SAndroid Build Coastguard Worker        ldr             w10, [x0, #ALLOW_UPDATE_CDF]  // non-zero enables the cdf update below
562*c0909341SAndroid Build Coastguard Worker
563*c0909341SAndroid Build Coastguard Worker        clz             w5,  w4                // clz(rng)
564*c0909341SAndroid Build Coastguard Worker        eor             w5,  w5,  #16          // d = clz(rng) ^ 16
565*c0909341SAndroid Build Coastguard Worker
566*c0909341SAndroid Build Coastguard Worker        cbz             w10, 1f                // updates disabled: go straight to renormalization
567*c0909341SAndroid Build Coastguard Worker
568*c0909341SAndroid Build Coastguard Worker        lsr             w2,  w9,  #16          // count = cdf[1]
569*c0909341SAndroid Build Coastguard Worker        and             w9,  w9,  #0xffff      // cdf[0]
570*c0909341SAndroid Build Coastguard Worker
571*c0909341SAndroid Build Coastguard Worker        sub             w3,  w2,  w2, lsr #5   // count - (count >= 32)
572*c0909341SAndroid Build Coastguard Worker        lsr             w2,  w2,  #4           // count >> 4
573*c0909341SAndroid Build Coastguard Worker        add             w10, w3,  #1           // count + (count < 32)
574*c0909341SAndroid Build Coastguard Worker        add             w2,  w2,  #4           // rate = (count >> 4) | 4
575*c0909341SAndroid Build Coastguard Worker
576*c0909341SAndroid Build Coastguard Worker        sub             w9,  w9,  w15          // cdf[0] -= bit
577*c0909341SAndroid Build Coastguard Worker        sub             w11, w9,  w15, lsl #15 // {cdf[0], cdf[0] - 32769}
578*c0909341SAndroid Build Coastguard Worker        asr             w11, w11, w2           // {cdf[0], cdf[0] - 32769} >> rate
579*c0909341SAndroid Build Coastguard Worker        sub             w9,  w9,  w11          // cdf[0]
580*c0909341SAndroid Build Coastguard Worker
581*c0909341SAndroid Build Coastguard Worker        strh            w9,  [x1]              // store the adapted cdf[0]
582*c0909341SAndroid Build Coastguard Worker        strh            w10, [x1, #2]          // store the new count in cdf[1]
583*c0909341SAndroid Build Coastguard Worker
584*c0909341SAndroid Build Coastguard Worker1:
585*c0909341SAndroid Build Coastguard Worker        lsl             w4,  w4,  w5           // rng << d
586*c0909341SAndroid Build Coastguard Worker        subs            w6,  w6,  w5           // cnt -= d
587*c0909341SAndroid Build Coastguard Worker        lsl             x7,  x7,  x5           // (dif - (v << 48)) << d
588*c0909341SAndroid Build Coastguard Worker        str             w4,  [x0, #RNG]
589*c0909341SAndroid Build Coastguard Worker        b.lo            L(refill)              // borrow (unsigned cnt < d): refill before returning
590*c0909341SAndroid Build Coastguard Worker
591*c0909341SAndroid Build Coastguard Worker        str             w6,  [x0, #CNT]
592*c0909341SAndroid Build Coastguard Worker        str             x7,  [x0, #DIF]
593*c0909341SAndroid Build Coastguard Worker        mov             w0,  w15               // return the decoded bit
594*c0909341SAndroid Build Coastguard Worker        ret
595*c0909341SAndroid Build Coastguard Workerendfunc
596