// xref: /aosp_15_r20/external/libdav1d/src/arm/32/msac.S (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
/*
 * Copyright © 2019, VideoLAN and dav1d authors
 * Copyright © 2020, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"
#include "util.S"

// Byte offsets of the context fields that this file reaches through r0 (the
// MsacContext *s argument named in the prototype comment further down).
// NOTE(review): these presumably have to mirror the layout of the C-side
// struct; confirm against its definition whenever that struct changes.

// Read pointer into the input; loaded as 32 bits and advanced by the refill code.
#define BUF_POS 0
// End-of-input pointer; loaded as 32 bits and compared against BUF_POS.
#define BUF_END 4
// 32-bit decoding window; its upper halfword is additionally read at DIF + 2.
#define DIF 8
// Range value; written as 32 bits, read as a halfword by the NEON decoders.
#define RNG 12
// 32-bit bit counter, decremented on renormalisation and topped up on refill.
#define CNT 16
// 32-bit flag; when it reads as zero the CDF adaptation step is skipped.
#define ALLOW_UPDATE_CDF 20

// Per-lane offsets added to the masked CDF values when the split points are
// formed (the inline comments at the use sites call the loaded values
// "EC_MIN_PROB * (n_symbols - ret)").
//
// Entry i of the first row is 4 * (15 - i). The symbol decoders start from
// "coeffs, 30" (apparently a byte offset, i.e. the final 0 of the first row)
// and step back 2 * n_symbols bytes, so lane i receives 4 * (n_symbols - i)
// for i <= n_symbols and 0 beyond that. The all-zero second row keeps a
// 16-halfword load that starts anywhere in the first row inside the table.
const coeffs
        .short 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0
        .short 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 0, 0
endconst

// Lane i holds 1 << i. ANDing this table with a per-lane compare result
// (all-ones or zero) and summing the lanes turns the compare mask into a
// bitmap with one bit per lane; the decoders then locate its lowest set bit
// with rbit + clz.
// NOTE(review): align=4 presumably requests 16-byte alignment; the table is
// loaded with :128 alignment hints, so confirm against the const macro.
const bits, align=4
        .short   0x1,   0x2,   0x4,   0x8,   0x10,   0x20,   0x40,   0x80
        .short 0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000
endconst

// vld1_align_n d0, q0, q1, src, n
// Loads 16-bit elements from the address in src using alignment-hinted VLD1
// forms, selected by the element count n:
//   n == 4    -> {d0}      with a :64 hint
//   n == 8    -> {q0}      with a :128 hint
//   otherwise -> {q0, q1}  with a :128 hint
// Only the operands of the selected case are referenced. Operands are pasted
// verbatim and their width is not checked: decode_update passes a Q register
// as d0 when it loads the bits table, so for n == 4 that call loads a whole
// Q register under the :64 hint.
.macro vld1_align_n d0, q0, q1, src, n
.if \n == 4
        vld1.16         {\d0},  [\src, :64]
.elseif \n == 8
        vld1.16         {\q0},  [\src, :128]
.else
        vld1.16         {\q0, \q1},  [\src, :128]
.endif
.endm

// vld1_n d0, q0, q1, src, n
// Same dispatch as vld1_align_n but without any alignment hint, for sources
// whose address is only halfword aligned (decode_update uses it for the
// coeffs table, which it enters at a position that depends on n_symbols):
//   n == 4    -> {d0}
//   n == 8    -> {q0}
//   otherwise -> {q0, q1}
.macro vld1_n d0, q0, q1, src, n
.if \n == 4
        vld1.16         {\d0},  [\src]
.elseif \n == 8
        vld1.16         {\q0},  [\src]
.else
        vld1.16         {\q0, \q1},  [\src]
.endif
.endm

// vst1_align_n d0, q0, q1, src, n
// Store counterpart of vld1_align_n. Note that the fourth operand is named
// src but is the destination address register of the store:
//   n == 4    -> {d0}      with a :64 hint
//   n == 8    -> {q0}      with a :128 hint
//   otherwise -> {q0, q1}  with a :128 hint
.macro vst1_align_n d0, q0, q1, src, n
.if \n == 4
        vst1.16         {\d0},  [\src, :64]
.elseif \n == 8
        vst1.16         {\q0},  [\src, :128]
.else
        vst1.16         {\q0, \q1},  [\src, :128]
.endif
.endm

// vst1_n d0, q0, q1, src, n
// Store counterpart of vld1_n (no alignment hint); the fourth operand is the
// destination address register:
//   n == 4    -> {d0}
//   n == 8    -> {q0}
//   otherwise -> {q0, q1}
// decode_update passes a Q register as d0, so for n == 4 it writes a full
// Q register (four split points plus the zeroed upper half) to the stack.
.macro vst1_n d0, q0, q1, src, n
.if \n == 4
        vst1.16         {\d0},  [\src]
.elseif \n == 8
        vst1.16         {\q0},  [\src]
.else
        vst1.16         {\q0, \q1},  [\src]
.endif
.endm

// vshr_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n
// Element-count-dispatched VSHR.U16 (logical right shift of 16-bit lanes);
// s3/s4/s5 supply the shift operand of the respective case:
//   n == 4    -> d0 = s0 >> s3
//   n == 16   -> d1 = s1 >> s4  and  d2 = s2 >> s5
//   otherwise -> d1 = s1 >> s4
// Operands are pasted verbatim; the call site decides which D or Q registers
// back each case. This macro is not invoked in the visible part of this file.
.macro vshr_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n
.if \n == 4
        vshr.u16        \d0,  \s0,  \s3
.else
        vshr.u16        \d1,  \s1,  \s4
.if \n == 16
        vshr.u16        \d2,  \s2,  \s5
.endif
.endif
.endm

// vadd_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n
// Element-count-dispatched VADD.I16 (16-bit lane add):
//   n == 4    -> d0 = s0 + s3
//   n == 16   -> d1 = s1 + s4  and  d2 = s2 + s5
//   otherwise -> d1 = s1 + s4
// Operands are pasted verbatim; the call site decides which D or Q registers
// back each case.
.macro vadd_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n
.if \n == 4
        vadd.i16        \d0,  \s0,  \s3
.else
        vadd.i16        \d1,  \s1,  \s4
.if \n == 16
        vadd.i16        \d2,  \s2,  \s5
.endif
.endif
.endm

// vsub_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n
// Element-count-dispatched VSUB.I16 (16-bit lane subtract):
//   n == 4    -> d0 = s0 - s3
//   n == 16   -> d1 = s1 - s4  and  d2 = s2 - s5
//   otherwise -> d1 = s1 - s4
// Operands are pasted verbatim; the call site decides which D or Q registers
// back each case.
.macro vsub_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n
.if \n == 4
        vsub.i16        \d0,  \s0,  \s3
.else
        vsub.i16        \d1,  \s1,  \s4
.if \n == 16
        vsub.i16        \d2,  \s2,  \s5
.endif
.endif
.endm

// vand_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n
// Element-count-dispatched bitwise VAND:
//   n == 4    -> d0 = s0 & s3
//   n == 16   -> d1 = s1 & s4  and  d2 = s2 & s5
//   otherwise -> d1 = s1 & s4
// Operands are pasted verbatim; the call site decides which D or Q registers
// back each case.
.macro vand_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n
.if \n == 4
        vand            \d0,  \s0,  \s3
.else
        vand            \d1,  \s1,  \s4
.if \n == 16
        vand            \d2,  \s2,  \s5
.endif
.endif
.endm

// vcge_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n
// Element-count-dispatched VCGE.U16: each 16-bit result lane becomes all-ones
// when the first source lane is >= the second (unsigned), otherwise zero.
//   n == 4    -> d0 = (s0 >= s3)
//   n == 16   -> d1 = (s1 >= s4)  and  d2 = (s2 >= s5)
//   otherwise -> d1 = (s1 >= s4)
// Operands are pasted verbatim; the call site decides which D or Q registers
// back each case.
.macro vcge_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n
.if \n == 4
        vcge.u16        \d0,  \s0,  \s3
.else
        vcge.u16        \d1,  \s1,  \s4
.if \n == 16
        vcge.u16        \d2,  \s2,  \s5
.endif
.endif
.endm

// vrhadd_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n
// Element-count-dispatched VRHADD.U16 (unsigned rounding halving add, i.e.
// (a + b + 1) >> 1 per 16-bit lane):
//   n == 4    -> d0 = rhadd(s0, s3)
//   n == 16   -> d1 = rhadd(s1, s4)  and  d2 = rhadd(s2, s5)
//   otherwise -> d1 = rhadd(s1, s4)
// Operands are pasted verbatim; the call site decides which D or Q registers
// back each case.
.macro vrhadd_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n
.if \n == 4
        vrhadd.u16      \d0,  \s0,  \s3
.else
        vrhadd.u16      \d1,  \s1,  \s4
.if \n == 16
        vrhadd.u16      \d2,  \s2,  \s5
.endif
.endif
.endm

// vshl_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n
// Element-count-dispatched VSHL.S16 with a per-lane register shift amount; a
// negative amount shifts right arithmetically, which is how decode_update
// uses it (it passes the negated rate).
//   n == 4    -> d0 = s0 shifted by s3
//   n == 16   -> d1 = s1 shifted by s4  and  d2 = s2 shifted by s5
//   otherwise -> d1 = s1 shifted by s4
// Operands are pasted verbatim; the call site decides which D or Q registers
// back each case.
.macro vshl_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n
.if \n == 4
        vshl.s16        \d0,  \s0,  \s3
.else
        vshl.s16        \d1,  \s1,  \s4
.if \n == 16
        vshl.s16        \d2,  \s2,  \s5
.endif
.endif
.endm

// vqdmulh_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n
// Element-count-dispatched VQDMULH.S16 (signed saturating doubling multiply
// returning the high half, i.e. (2 * a * b) >> 16 per 16-bit lane):
//   n == 4    -> d0 = qdmulh(s0, s3)
//   n == 16   -> d1 = qdmulh(s1, s4)  and  d2 = qdmulh(s2, s5)
//   otherwise -> d1 = qdmulh(s1, s4)
// Operands are pasted verbatim; the call site decides which D or Q registers
// back each case.
.macro vqdmulh_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n
.if \n == 4
        vqdmulh.s16     \d0,  \s0,  \s3
.else
        vqdmulh.s16     \d1,  \s1,  \s4
.if \n == 16
        vqdmulh.s16     \d2,  \s2,  \s5
.endif
.endif
.endm

// unsigned dav1d_msac_decode_symbol_adapt4_neon(MsacContext *s, uint16_t *cdf,
//                                               size_t n_symbols);
//
// Decodes one symbol using four CDF lanes and, when the context allows it,
// adapts the CDF in place.
//   In:  r0 = s, r1 = cdf, r2 = n_symbols
//   Out: r0 = decoded symbol index
//
// This function also hosts two pieces shared by the rest of the file:
//   * the decode_update macro, instantiated here for 4 lanes and by the
//     adapt8/adapt16 entry points for 8 and 16 lanes;
//   * the renormalise/refill/return tail, entered at L(renorm) by the symbol
//     decoders and at L(renorm2) by the bool decoders.
//
// Stack frame expected by the shared tail (built by every routine that
// reaches it): r4-r10 and lr pushed, followed by 48 bytes of scratch:
//   [sp, #14]       u   = rng as it was on entry (halfword)
//   [sp, #16 ...]   v[] = per-lane split points (halfwords)
// so the halfword just below v[0] is u. Only the symbol decoders use the
// scratch area; the bool decoders reserve it solely so that the common
// epilogue is valid for them too.
//
// NEON usage is confined to q0-q3 and q8-q15; d8-d15 are never touched.

function msac_decode_symbol_adapt4_neon, export=1
// decode_update n  (n = 4, 8 or 16 lanes)
// Emits the prologue, the vectorised symbol search and the optional CDF
// update. On exit from the expanded code:
//   lr  = ret, the index of the lowest lane whose split point v is <= the
//         upper halfword of dif
//   r0, r1 unchanged; u and v[] stored in the frame as described above
// If the allow-update flag is zero the code branches straight to L(renorm);
// otherwise it falls out of the macro after storing the adapted CDF and the
// incremented counter at cdf[n_symbols].
.macro decode_update n
        push            {r4-r10,lr}
        sub             sp,  sp,  #48
        add             r8,  r0,  #RNG

        vld1_align_n    d0,  q0,  q1,  r1,  \n                         // cdf
        vld1.16         {d16[]}, [r8, :16]                             // rng
        movrel_local    r9,  coeffs, 30
        vmov.i16        d30, #0x7f00                                   // 0x7f00
        // r9 -> first of the per-lane offsets for this n_symbols (see coeffs)
        sub             r9,  r9,  r2, lsl #1
        vmvn.i16        q14, #0x3f                                     // 0xffc0
        add             r8,  sp,  #14
        vand            d22, d16, d30                                  // rng & 0x7f00
        vst1.16         {d16[0]}, [r8, :16]                            // store original u = s->rng
        vand_n          d4,  q2,  q3,  d0,  q0,  q1, d28, q14, q14, \n // cdf & 0xffc0
.if \n > 4
        vmov            d23, d22
.endif

        vld1_n          d16, q8,  q9,  r9,  \n                          // EC_MIN_PROB * (n_symbols - ret)
        vqdmulh_n       d20, q10, q11, d4,  q2,  q3,  d22, q11, q11, \n // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1
        add             r8,  r0,  #DIF + 2

        vadd_n          d16, q8,  q9,  d4,  q2,  q3,  d16, q8,  q9,  \n // v = cdf + EC_MIN_PROB * (n_symbols - ret)
.if \n == 4
        // The compare and the store below work on all of q8, so the four
        // unused upper lanes are given v = 0.
        vmov.i16        d17, #0
.endif
        vadd_n          d16, q8,  q9,  d20, q10, q11, d16, q8,  q9,  \n // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret)

        add             r9,  sp,  #16
        vld1.16         {d20[]}, [r8, :16]                              // dif >> (EC_WIN_SIZE - 16)
        movrel_local    r8,  bits
        vst1_n          q8,  q8,  q9,  r9,  \n                          // store v values to allow indexed access

        vmov            d21, d20
        vld1_align_n    q12, q12, q13, r8,  \n
.if \n == 16
        vmov            q11, q10
.endif

        vcge_n          q2,  q2,  q3,  q10, q10, q11, q8,  q8,  q9,  \n // c >= v

        vand_n          q10, q10, q11, q2,  q2,  q3,  q12, q12, q13, \n // One bit per halfword set in the mask
.if \n == 16
        vadd.i16        q10, q10, q11
.endif
        vadd.i16        d20, d20, d21                                   // Aggregate mask bits
        ldr             r4,  [r0, #ALLOW_UPDATE_CDF]
        vpadd.i16       d20, d20, d20
        lsl             r10, r2,  #1                                    // byte offset of cdf[n_symbols]
        vpadd.i16       d20, d20, d20
        vmov.u16        r3,  d20[0]
        // The flags set here are consumed by the beq below; rbit and clz do
        // not modify them.
        cmp             r4,  #0
        rbit            r3,  r3
        clz             lr,  r3                                         // ret

        beq             L(renorm)
        // update_cdf
        ldrh            r3,  [r1, r10]                                  // count = cdf[n_symbols]
        vmov.i8         q10, #0xff
.if \n == 16
        mov             r4,  #-5
.else
        mvn             r12, r2
        mov             r4,  #-4
        // The carry produced here is consumed by the sbc further down; no
        // instruction in between writes the flags.
        cmn             r12, #3                                         // set C if n_symbols <= 2
.endif
        vrhadd_n        d16, q8,  q9,  d20, q10, q10, d4,  q2,  q3,  \n // i >= val ? -1 : 32768
.if \n == 16
        sub             r4,  r4,  r3, lsr #4                            // -((count >> 4) + 5)
.else
        lsr             r12, r3,  #4                                    // count >> 4
        sbc             r4,  r4,  r12                                   // -((count >> 4) + (n_symbols > 2) + 4)
.endif
        vsub_n          d16, q8,  q9,  d16, q8,  q9,  d0,  q0,  q1,  \n // (32768 - cdf[i]) or (-1 - cdf[i])
.if \n == 4
        vdup.16         d20, r4                                         // -rate
.else
        vdup.16         q10, r4                                         // -rate
.endif

        sub             r3,  r3,  r3, lsr #5                            // count - (count == 32)
        vsub_n          d0,  q0,  q1,  d0,  q0,  q1,  d4,  q2,  q3,  \n // cdf + (i >= val ? 1 : 0)
        vshl_n          d16, q8,  q9,  d16, q8,  q9,  d20, q10, q10, \n // ({32768,-1} - cdf[i]) >> rate
        add             r3,  r3,  #1                                    // count + (count < 32)
        vadd_n          d0,  q0,  q1,  d0,  q0,  q1,  d16, q8,  q9,  \n // cdf + (32768 - cdf[i]) >> rate
        vst1_align_n    d0,  q0,  q1,  r1,  \n
        strh            r3,  [r1, r10]
.endm

        decode_update   4
        // Falls through into the shared tail.

// L(renorm): entry for the symbol decoders.
//   In: r0 = s, lr = ret, frame as described at the top of this function.
// Picks v = v[ret] and u = the halfword just below it (v[ret - 1], or the
// saved rng when ret == 0), then derives the new range and window.
L(renorm):
        add             r8,  sp,  #16
        add             r8,  r8,  lr, lsl #1
        ldrh            r3,  [r8]              // v
        ldrh            r4,  [r8, #-2]         // u
        ldr             r6,  [r0, #CNT]
        ldr             r7,  [r0, #DIF]
        sub             r4,  r4,  r3           // rng = u - v
        clz             r5,  r4                // clz(rng)
        eor             r5,  r5,  #16          // d = clz(rng) ^ 16
        sub             r7,  r7,  r3, lsl #16  // dif - (v << 16)
// L(renorm2): entry for the bool decoders.
//   In: r0 = s, r4 = new rng before normalisation, r5 = d (shift amount),
//       r6 = cnt, r7 = dif, lr = value to return, frame set up as above.
L(renorm2):
        lsl             r4,  r4,  r5           // rng << d
        subs            r6,  r6,  r5           // cnt -= d
        lsl             r7,  r7,  r5           // (dif - (v << 16)) << d
        str             r4,  [r0, #RNG]
        // No borrow from the subs above (lsl and str leave the flags alone):
        // enough bits remain, skip the refill.
        bhs             4f

        // refill
        ldr             r3,  [r0, #BUF_POS]    // BUF_POS
        ldr             r4,  [r0, #BUF_END]    // BUF_END
        add             r5,  r3,  #4
        subs            r5,  r5,  r4           // r5 = buf_pos + 4 - buf_end
        bhi             6f                     // fewer than 4 bytes left

        ldr             r8,  [r3]              // next_bits
        rsb             r5,  r6,  #16
        add             r4,  r6,  #16          // shift_bits = cnt + 16
        mvn             r8,  r8
        lsr             r5,  r5,  #3           // num_bytes_read
        rev             r8,  r8                // next_bits = bswap(next_bits)
        lsr             r8,  r8,  r4           // next_bits >>= shift_bits

2:      // refill_end
        add             r3,  r3,  r5
        add             r6,  r6,  r5, lsl #3   // cnt += num_bits_read
        str             r3,  [r0, #BUF_POS]

3:      // refill_end2
        orr             r7,  r7,  r8           // dif |= next_bits

4:      // end
        str             r6,  [r0, #CNT]
        str             r7,  [r0, #DIF]
        mov             r0,  lr
        add             sp,  sp,  #48
        pop             {r4-r10,pc}

5:      // pad_with_ones
        // Input exhausted (buf_pos >= buf_end). A register-controlled lsr
        // takes its amount from the bottom byte of the shift register, so
        // this computes (cnt - 240) >> ((cnt - 240) & 0xff). For cnt in
        // [-16, -1] that byte equals cnt + 16, the shift used by the normal
        // refill path, applied to a value whose upper 24 bits are all ones.
        // NOTE(review): that cnt range is assumed here, not enforced.
        add             r8,  r6,  #-240
        lsr             r8,  r8,  r8
        b               3b

6:      // refill_eob
        // In: r3 = buf_pos, r4 = buf_end, r5 = buf_pos + 4 - buf_end (> 0).
        cmp             r3,  r4
        bhs             5b

        // Read the last four bytes before buf_end and shift out the ones that
        // precede buf_pos (assuming little-endian loads); the zeros shifted
        // in become ones after the mvn.
        ldr             r8,  [r4, #-4]
        lsl             r5,  r5,  #3
        lsr             r8,  r8,  r5
        add             r5,  r6,  #16
        mvn             r8,  r8
        sub             r4,  r4,  r3           // num_bytes_left
        rev             r8,  r8
        lsr             r8,  r8,  r5
        rsb             r5,  r6,  #16
        lsr             r5,  r5,  #3
        // num_bytes_read = min((16 - cnt) >> 3, num_bytes_left); the IT
        // prefix is required when this is assembled as Thumb-2.
        cmp             r5,  r4
        it              hs
        movhs           r5,  r4
        b               2b
endfunc

// 8-lane variant of the adaptive symbol decoder.
//   In:  r0 = context pointer, r1 = cdf, r2 = n_symbols (the registers read
//        by decode_update)
//   Out: r0 = decoded symbol index
// Expands decode_update for 8 lanes and then joins the renormalise/refill/
// return tail at L(renorm), which lives in msac_decode_symbol_adapt4_neon and
// performs the actual return.
function msac_decode_symbol_adapt8_neon, export=1
        decode_update   8
        b               L(renorm)
endfunc

// 16-lane variant of the adaptive symbol decoder.
//   In:  r0 = context pointer, r1 = cdf, r2 = n_symbols (the registers read
//        by decode_update)
//   Out: r0 = decoded symbol index
// Expands decode_update for 16 lanes (which uses the fixed rate offset of 5
// rather than the n_symbols-dependent one) and then joins the renormalise/
// refill/return tail at L(renorm) in msac_decode_symbol_adapt4_neon.
function msac_decode_symbol_adapt16_neon, export=1
        decode_update   16
        b               L(renorm)
endfunc

// Decodes a token by running up to four rounds of a 3-symbol adaptive decode
// against the same CDF.
//   In:  r0 = context pointer (same field offsets as the other decoders)
//        r1 = cdf: four halfwords, loaded with a :64 hint; the fourth one
//             (byte offset 6) is the adaptation counter
//   Out: r0 = 3 + sum of the per-round symbol indices
//
// Loop accounting: r2 starts at -24 and every round adds 2 * ret in two steps
// (+5, then 2 * ret - 5). The second step is a flag-setting add whose carry
// ends the loop: it is set whenever ret < 3, and for ret == 3 only in the
// fourth round, when r2 reaches 0. The result is (r2 + 30) >> 1.
//
// State kept in registers across rounds:
//   r6 = cnt, r7 = dif, r9 = count, r10 = allow-update flag
//   d0 = cdf, d20 = cdf & 0xffc0, d1 = rng (all lanes),
//   q1 (d2/d3) = upper halfword of dif (all lanes)
//   d29 = per-lane offsets from coeffs, q8 = bits table
//   d30 = 0xffc0, d31 = 0x7f00
// The frame matches the one used by the symbol decoders: r4-r10 and lr
// pushed, 48 bytes of scratch with u at [sp, #14] and v[] from [sp, #16].
function msac_decode_hi_tok_neon, export=1
        push            {r4-r10,lr}
        vld1.16         {d0},  [r1, :64]       // cdf
        add             r4,  r0,  #RNG
        vmov.i16        d31, #0x7f00           // 0x7f00
        movrel_local    r5,  coeffs, 30-2*3
        vmvn.i16        d30, #0x3f             // 0xffc0
        ldrh            r9,  [r1, #6]          // count = cdf[n_symbols]
        vld1.16         {d1[]},  [r4, :16]     // rng
        movrel_local    r4,  bits
        vld1.16         {d29}, [r5]            // EC_MIN_PROB * (n_symbols - ret)
        add             r5,  r0,  #DIF + 2
        vld1.16         {q8}, [r4, :128]
        mov             r2,  #-24
        vand            d20, d0, d30           // cdf & 0xffc0
        ldr             r10, [r0, #ALLOW_UPDATE_CDF]
        vld1.16         {d2[]}, [r5, :16]      // dif >> (EC_WIN_SIZE - 16)
        sub             sp,  sp,  #48
        ldr             r6,  [r0, #CNT]
        ldr             r7,  [r0, #DIF]
        vmov            d3,  d2
1:
        vand            d23, d1,  d31          // rng & 0x7f00
        vqdmulh.s16     d18, d20, d23          // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1
        add             r12, sp,  #14
        vadd.i16        d6,  d20, d29          // v = cdf + EC_MIN_PROB * (n_symbols - ret)
        vadd.i16        d6,  d18, d6           // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret)
        vmov.i16        d7,  #0                // upper four lanes of q3: v = 0
        vst1.16         {d1[0]}, [r12, :16]    // store original u = s->rng
        add             r12, sp,  #16
        vcge.u16        q2,  q1,  q3           // c >= v
        vst1.16         {q3},  [r12]           // store v values to allow indexed access
        vand            q9,  q2,  q8           // One bit per halfword set in the mask

        vadd.i16        d18, d18, d19          // Aggregate mask bits
        vpadd.i16       d18, d18, d18
        vpadd.i16       d18, d18, d18
        vmov.u16        r3,  d18[0]
        // Flags for the beq below; the add, rbit and clz in between do not
        // modify them.
        cmp             r10, #0
        add             r2,  r2,  #5
        rbit            r3,  r3
        add             r8,  sp,  #16
        clz             lr,  r3                // ret

        beq             2f
        // update_cdf
        vmov.i8         d22, #0xff
        mov             r4,  #-5
        vrhadd.u16      d6,  d22, d4           // i >= val ? -1 : 32768
        sub             r4,  r4,  r9, lsr #4   // -((count >> 4) + 5)
        vsub.i16        d6,  d6,  d0           // (32768 - cdf[i]) or (-1 - cdf[i])
        vdup.16         d18, r4                // -rate

        sub             r9,  r9,  r9, lsr #5   // count - (count == 32)
        vsub.i16        d0,  d0,  d4           // cdf + (i >= val ? 1 : 0)
        vshl.s16        d6,  d6,  d18          // ({32768,-1} - cdf[i]) >> rate
        add             r9,  r9,  #1           // count + (count < 32)
        vadd.i16        d0,  d0,  d6           // cdf + (32768 - cdf[i]) >> rate
        vst1.16         {d0},  [r1, :64]
        // Refresh the masked copy so that the next round sees the adapted CDF.
        vand            d20, d0,  d30          // cdf & 0xffc0
        strh            r9,  [r1, #6]

2:
        // Renormalise; r8 = address of v[0], lr = ret.
        add             r8,  r8,  lr, lsl #1
        ldrh            r3,  [r8]              // v
        ldrh            r4,  [r8, #-2]         // u
        sub             r4,  r4,  r3           // rng = u - v
        clz             r5,  r4                // clz(rng)
        eor             r5,  r5,  #16          // d = clz(rng) ^ 16
        sub             r7,  r7,  r3, lsl #16  // dif - (v << 16)
        lsl             r4,  r4,  r5           // rng << d
        subs            r6,  r6,  r5           // cnt -= d
        lsl             r7,  r7,  r5           // (dif - (v << 16)) << d
        str             r4,  [r0, #RNG]
        vdup.16         d1,  r4                // rng for the next round
        // No borrow from the subs above: enough bits remain, skip the refill.
        bhs             5f

        // refill
        ldr             r3,  [r0, #BUF_POS]    // BUF_POS
        ldr             r4,  [r0, #BUF_END]    // BUF_END
        add             r5,  r3,  #4
        subs            r5,  r5,  r4           // r5 = buf_pos + 4 - buf_end
        bhi             7f                     // fewer than 4 bytes left

        ldr             r8,  [r3]              // next_bits
        rsb             r5,  r6,  #16
        add             r4,  r6,  #16          // shift_bits = cnt + 16
        mvn             r8,  r8
        lsr             r5,  r5,  #3           // num_bytes_read
        rev             r8,  r8                // next_bits = bswap(next_bits)
        lsr             r8,  r8,  r4           // next_bits >>= shift_bits

3:      // refill_end
        add             r3,  r3,  r5
        add             r6,  r6,  r5, lsl #3   // cnt += num_bits_read
        str             r3,  [r0, #BUF_POS]

4:      // refill_end2
        orr             r7,  r7,  r8           // dif |= next_bits

5:      // end
        lsl             lr,  lr,  #1
        sub             lr,  lr,  #5           // 2 * ret - 5
        lsr             r12, r7,  #16
        adds            r2,  r2,  lr           // carry = tok_br < 3 || tok == 15
        vdup.16         q1,  r12               // upper halfword of dif for the next round
        bcc             1b                     // loop if !carry
        add             r2,  r2,  #30
        str             r6,  [r0, #CNT]
        add             sp,  sp,  #48
        str             r7,  [r0, #DIF]
        lsr             r0,  r2,  #1
        pop             {r4-r10,pc}

6:      // pad_with_ones
        // Input exhausted. A register-controlled lsr uses the bottom byte of
        // the shift register: this is (cnt - 240) >> ((cnt - 240) & 0xff),
        // and for cnt in [-16, -1] that byte equals cnt + 16.
        // NOTE(review): that cnt range is assumed here, not enforced.
        add             r8,  r6,  #-240
        lsr             r8,  r8,  r8
        b               4b

7:      // refill_eob
        // In: r3 = buf_pos, r4 = buf_end, r5 = buf_pos + 4 - buf_end (> 0).
        cmp             r3,  r4
        bhs             6b

        // Read the last four bytes before buf_end and shift out the ones that
        // precede buf_pos (assuming little-endian loads); the zeros shifted
        // in become ones after the mvn.
        ldr             r8,  [r4, #-4]
        lsl             r5,  r5,  #3
        lsr             r8,  r8,  r5
        add             r5,  r6,  #16
        mvn             r8,  r8
        sub             r4,  r4,  r3           // num_bytes_left
        rev             r8,  r8
        lsr             r8,  r8,  r5
        rsb             r5,  r6,  #16
        lsr             r5,  r5,  #3
        // num_bytes_read = min((16 - cnt) >> 3, num_bytes_left)
        cmp             r5,  r4
        it              hs
        movhs           r5,  r4
        b               3b
endfunc

// Decodes one equiprobable bit; no probability is read from memory.
//   In:  r0 = context pointer
//   Out: r0 = 1 when dif < vw, else 0
// with v = ((rng & ~0xff) + 8) >> 1 and vw = v << 16.
//
// Builds the same frame as the symbol decoders (r4-r10 and lr pushed, 48
// bytes reserved) and finishes through the shared tail at L(renorm2) in
// msac_decode_symbol_adapt4_neon, handing over r4 = new rng, r5 = d,
// r6 = cnt, r7 = dif and lr = return value.
function msac_decode_bool_equi_neon, export=1
        push            {r4-r10,lr}
        ldr             r5,  [r0, #RNG]
        ldr             r6,  [r0, #CNT]
        sub             sp,  sp,  #48
        ldr             r7,  [r0, #DIF]
        bic             r4,  r5,  #0xff        // r &= 0xff00
        add             r4,  r4,  #8
        mov             r2,  #0
        // r4 still holds 2 * v here, hence the shift by 15 instead of 16.
        subs            r8,  r7,  r4, lsl #15  // dif - vw
        lsr             r4,  r4,  #1           // v
        sub             r5,  r5,  r4           // r - v
        // Flags come from the subs above; lsr and sub do not modify them.
        // lo (borrow)    : dif <  vw -> bit = 1, rng = v,     dif unchanged
        // hs (no borrow) : dif >= vw -> bit = 0, rng = r - v, dif -= vw
        itee            lo
        movlo           r2,  #1
        movhs           r4,  r5                // dif >= vw: rng = r - v
        movhs           r7,  r8                // dif >= vw: dif = dif - vw

        clz             r5,  r4                // clz(rng)
        eor             r5,  r5,  #16          // d = clz(rng) ^ 16
        mov             lr,  r2
        b               L(renorm2)
endfunc

// Decodes one bit with a caller-supplied probability.
//   In:  r0 = context pointer, r1 = f (probability; its low 6 bits are
//        discarded)
//   Out: r0 = 1 when dif < vw, else 0
// with v = (((rng >> 8) * (f & ~0x3f)) >> 7) + 4 and vw = v << 16.
//
// Builds the same frame as the symbol decoders (r4-r10 and lr pushed, 48
// bytes reserved) and finishes through the shared tail at L(renorm2) in
// msac_decode_symbol_adapt4_neon, handing over r4 = new rng, r5 = d,
// r6 = cnt, r7 = dif and lr = return value.
function msac_decode_bool_neon, export=1
        push            {r4-r10,lr}
        ldr             r5,  [r0, #RNG]
        ldr             r6,  [r0, #CNT]
        sub             sp,  sp,  #48
        ldr             r7,  [r0, #DIF]
        lsr             r4,  r5,  #8           // r >> 8
        bic             r1,  r1,  #0x3f        // f &= ~63
        mul             r4,  r4,  r1
        mov             r2,  #0
        lsr             r4,  r4,  #7
        add             r4,  r4,  #4           // v
        subs            r8,  r7,  r4, lsl #16  // dif - vw
        sub             r5,  r5,  r4           // r - v
        // Flags come from the subs above; the sub in between does not modify
        // them.
        // lo (borrow)    : dif <  vw -> bit = 1, rng = v,     dif unchanged
        // hs (no borrow) : dif >= vw -> bit = 0, rng = r - v, dif -= vw
        itee            lo
        movlo           r2,  #1
        movhs           r4,  r5                // dif >= vw: rng = r - v
        movhs           r7,  r8                // dif >= vw: dif = dif - vw

        clz             r5,  r4                // clz(rng)
        eor             r5,  r5,  #16          // d = clz(rng) ^ 16
        mov             lr,  r2
        b               L(renorm2)
endfunc

// Decodes one bit against an adaptive two-entry CDF and, when the context
// allows it, adapts that CDF in place.
//   In:  r0 = context pointer
//        r1 = cdf: two halfwords fetched with one 32-bit load; the low half
//             is cdf[0] (the probability), the high half cdf[1] (the counter)
//   Out: r0 = bit, 1 when dif < vw, else 0
// with v = (((rng >> 8) * (cdf[0] & 0xffc0)) >> 7) + 4 and vw = v << 16.
//
// Builds the same frame as the symbol decoders (r4-r10 and lr pushed, 48
// bytes reserved) and finishes through the shared tail at L(renorm2) in
// msac_decode_symbol_adapt4_neon, handing over r4 = new rng, r5 = d,
// r6 = cnt, r7 = dif and lr = return value.
function msac_decode_bool_adapt_neon, export=1
        push            {r4-r10,lr}
        ldr             r9,  [r1]              // cdf[0-1]
        ldr             r5,  [r0, #RNG]
        movw            lr,  #0xffc0
        ldr             r6,  [r0, #CNT]
        sub             sp,  sp,  #48
        ldr             r7,  [r0, #DIF]
        lsr             r4,  r5,  #8           // r >> 8
        // The mask also strips the counter held in the upper halfword of r9.
        and             r2,  r9,  lr           // f &= ~63
        mul             r4,  r4,  r2
        mov             r2,  #0
        lsr             r4,  r4,  #7
        add             r4,  r4,  #4           // v
        subs            r8,  r7,  r4, lsl #16  // dif - vw
        sub             r5,  r5,  r4           // r - v
        ldr             r10, [r0, #ALLOW_UPDATE_CDF]
        // Flags come from the subs above; the sub and ldr in between do not
        // modify them.
        // lo (borrow)    : dif <  vw -> bit = 1, rng = v,     dif unchanged
        // hs (no borrow) : dif >= vw -> bit = 0, rng = r - v, dif -= vw
        itee            lo
        movlo           r2,  #1
        movhs           r4,  r5                // dif >= vw: rng = r - v
        movhs           r7,  r8                // dif >= vw: dif = dif - vw

        // Flags for the beq below; clz, eor and mov do not modify them.
        cmp             r10, #0
        clz             r5,  r4                // clz(rng)
        eor             r5,  r5,  #16          // d = clz(rng) ^ 16
        mov             lr,  r2

        beq             L(renorm2)

        // CDF adaptation; lr = decoded bit.
        lsr             r2,  r9,  #16          // count = cdf[1]
        uxth            r9,  r9                // cdf[0]

        sub             r3,  r2,  r2,  lsr #5  // count - (count >= 32)
        lsr             r2,  r2,  #4           // count >> 4
        add             r10, r3,  #1           // count + (count < 32)
        add             r2,  r2,  #4           // rate = (count >> 4) | 4

        sub             r9,  r9,  lr           // cdf[0] -= bit
        sub             r3,  r9,  lr,  lsl #15 // {cdf[0], cdf[0] - 32769}
        asr             r3,  r3,  r2           // {cdf[0], cdf[0] - 32769} >> rate
        sub             r9,  r9,  r3           // cdf[0]

        strh            r9,  [r1]
        strh            r10, [r1, #2]

        b               L(renorm2)
endfunc
