/*
 * Copyright © 2019, VideoLAN and dav1d authors
 * Copyright © 2019, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"
#include "util.S"

#define BUF_POS 0
#define BUF_END 8
#define DIF 16
#define RNG 24
#define CNT 28
#define ALLOW_UPDATE_CDF 32

#define COEFFS_BASE_OFFSET 30
#define MASKS8_OFFSET (64-COEFFS_BASE_OFFSET)

const coeffs
        .short 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0
        .short 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
        // masks8
        .short -0x202, -0x202, -0x202, -0x202, -0x202, -0x202, -0x202, 0xF0E
endconst

.macro ld1_n d0, d1, src, sz, n
.if \n <= 8
        ld1     {\d0\sz}, [\src]
.else
        ld1     {\d0\sz, \d1\sz}, [\src]
.endif
.endm

.macro st1_n s0, s1, dst, sz, n
.if \n <= 8
        st1     {\s0\sz}, [\dst]
.else
        st1     {\s0\sz, \s1\sz}, [\dst]
.endif
.endm

.macro ushr_n d0, d1, s0, s1, shift, sz, n
        ushr    \d0\sz, \s0\sz, \shift
.if \n == 16
        ushr    \d1\sz, \s1\sz, \shift
.endif
.endm

.macro add_n d0, d1, s0, s1, s2, s3, sz, n
        add     \d0\sz, \s0\sz, \s2\sz
.if \n == 16
        add     \d1\sz, \s1\sz, \s3\sz
.endif
.endm

.macro sub_n d0, d1, s0, s1, s2, s3, sz, n
        sub     \d0\sz, \s0\sz, \s2\sz
.if \n == 16
        sub     \d1\sz, \s1\sz, \s3\sz
.endif
.endm

.macro and_n d0, d1, s0, s1, s2, s3, sz, n
        and     \d0\sz, \s0\sz, \s2\sz
.if \n == 16
        and     \d1\sz, \s1\sz, \s3\sz
.endif
.endm

.macro cmhs_n d0, d1, s0, s1, s2, s3, sz, n
        cmhs    \d0\sz, \s0\sz, \s2\sz
.if \n == 16
        cmhs    \d1\sz, \s1\sz, \s3\sz
.endif
.endm

.macro sshl_n d0, d1, s0, s1, s2, s3, sz, n
        sshl    \d0\sz, \s0\sz, \s2\sz
.if \n == 16
        sshl    \d1\sz, \s1\sz, \s3\sz
.endif
.endm

.macro sqdmulh_n d0, d1, s0, s1, s2, s3, sz, n
        sqdmulh \d0\sz, \s0\sz, \s2\sz
.if \n == 16
        sqdmulh \d1\sz, \s1\sz, \s3\sz
.endif
.endm

.macro str_n idx0, idx1, dstreg, dstoff, n
        str     \idx0, [\dstreg, \dstoff]
.if \n == 16
        str     \idx1, [\dstreg, \dstoff + 16]
.endif
.endm

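// For orientation, the decode_update macro below vectorizes a scalar
// search that, modelled on dav1d's C reference in src/msac.c, looks
// roughly like this (a sketch only: EC_PROB_SHIFT = 6, EC_MIN_PROB = 4
// and EC_WIN_SIZE = 64 are assumed from the C code, and the cdf update
// plus renormalization are elided):
//
//     unsigned c = s->dif >> (EC_WIN_SIZE - 16); // top 16 bits of dif
//     unsigned r = s->rng >> 8;
//     unsigned u, v = s->rng, val = -1;
//     do {
//         val++;
//         u = v;                                 // previous bound
//         v = r * (cdf[val] >> EC_PROB_SHIFT) >> 1;
//         v += EC_MIN_PROB * (n_symbols - val);
//     } while (c < v);                           // stop once c >= v
//     s->dif -= (ec_win)v << (EC_WIN_SIZE - 16);
//     // rng = u - v, then renormalize (and refill if cnt goes negative)
//
// The vector code computes all candidate v values at once (sqdmulh for
// the fixed-point multiply, the EC_MIN_PROB ramp loaded from the coeffs
// table above), performs every c >= v compare with one or two cmhs, and
// derives val/ret from the resulting 0/-1 lane mask instead of looping.
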
// unsigned dav1d_msac_decode_symbol_adapt4_neon(MsacContext *s, uint16_t *cdf,
//                                               size_t n_symbols);

function msac_decode_symbol_adapt4_neon, export=1
.macro decode_update sz, szb, n
.if \n == 16
        sub     sp, sp, #48
.endif
        add     x8, x0, #RNG
        ld1_n   v0, v1, x1, \sz, \n                // cdf
        ld1r    {v29\sz}, [x8]                     // rng
        movrel  x9, coeffs, COEFFS_BASE_OFFSET
        movi    v31\sz, #0x7f, lsl #8              // 0x7f00
        sub     x10, x9, x2, lsl #1
        mvni    v30\sz, #0x3f                      // 0xffc0
        and     v7\szb, v29\szb, v31\szb           // rng & 0x7f00
.if \n == 16
        str     h29, [sp, #14]                     // store original u = s->rng
.endif
        and_n   v2, v3, v0, v1, v30, v30, \szb, \n // cdf & 0xffc0

        ld1_n   v4, v5, x10, \sz, \n               // EC_MIN_PROB * (n_symbols - ret)
        sqdmulh_n v6, v7, v2, v3, v7, v7, \sz, \n  // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1
        ldr     d28, [x0, #DIF]

        add_n   v4, v5, v2, v3, v4, v5, \sz, \n    // v = cdf + EC_MIN_PROB * (n_symbols - ret)
        add_n   v4, v5, v6, v7, v4, v5, \sz, \n    // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret)

        dup     v30\sz, v28.h[3]                   // dif >> (EC_WIN_SIZE - 16)
.if \n == 8
        ldur    q31, [x9, #MASKS8_OFFSET]
.elseif \n == 16
        str_n   q4, q5, sp, #16, \n                // store v values to allow indexed access
.endif

        // Once the condition (c >= v) becomes true, it stays true for all
        // following lanes, so the vector looks like:
        // 0, 0, 0 ... -1, -1
        cmhs_n  v2, v3, v30, v30, v4, v5, \sz, \n  // c >= v
.if \n == 4
        ext     v29\szb, v29\szb, v4\szb, #6       // u
        umov    x15, v2.d[0]
        ldr     w4, [x0, #ALLOW_UPDATE_CDF]
        rev     x15, x15
        sub     v29\sz, v29\sz, v4\sz              // rng = u-v
        // rev + clz = count trailing zeros
        clz     x15, x15                           // 16*ret
.elseif \n == 8
        // The final short of the compare is always set, so the addv below
        // sums 0xF0E plus one -0x202 for each set lane before it. Since
        // 0xF0E == (0x202*7) | (1 << 8), the sum comes out as
        // ((2*ret + 1) << 8) | (2*ret): exactly the two byte indices
        // needed to look up the ret'th short with tbl later on.
        and     v31\szb, v31\szb, v2\szb
        ext     v29\szb, v29\szb, v4\szb, #14      // u
        addv    h31, v31\sz                        // ((2*ret + 1) << 8) | (2*ret)
        ldr     w4, [x0, #ALLOW_UPDATE_CDF]
        sub     v30\sz, v30\sz, v4\sz              // (dif >> 48) - v
        smov    w15, v31.b[0]                      // 2*ret
        sub     v29\sz, v29\sz, v4\sz              // rng = u-v
.elseif \n == 16
        add     v6\sz, v2\sz, v3\sz
        addv    h31, v6\sz                         // -n + ret
        ldr     w4, [x0, #ALLOW_UPDATE_CDF]
        smov    w15, v31.h[0]
.endif

        cbz     w4, 0f

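        // For reference, the adaptation below corresponds roughly to the
        // following C, modelled on dav1d's update_cdf() (a sketch; the
        // exact version lives in src/msac.h):
        //
        //     unsigned count = cdf[n_symbols];
        //     unsigned rate = 4 + (count >> 4) + (n_symbols > 2);
        //     for (i = 0; i < n_symbols; i++)
        //         if (i < val)
        //             cdf[i] += (32768 - cdf[i]) >> rate;
        //         else
        //             cdf[i] -= cdf[i] >> rate;
        //     cdf[n_symbols] = count + (count < 32);
        //
        // The vector version folds the two cases into one expression by
        // selecting either 32768 or -1 as the target value, adding 1 to
        // the i >= val lanes (the sub of the 0/-1 mask), and shifting
        // arithmetically with a negative sshl amount; for i >= val,
        // cdf[i] + 1 + ((-1 - cdf[i]) >> rate) == cdf[i] - (cdf[i] >> rate).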
        // update_cdf
        ldrh    w3, [x1, x2, lsl #1]               // count = cdf[n_symbols]
.if \n == 16
        // The n == 16 case has a lower bound that guarantees n_symbols > 2
        mov     w4, #-5
.elseif \n == 8
        mvn     w14, w2
        mov     w4, #-4
        cmn     w14, #3                            // set C if n_symbols <= 2
.else
        // if n_symbols < 4 (or < 6 even) then
        // (1 + n_symbols) >> 2 == n_symbols > 2
        add     w14, w2, #17                       // (1 + n_symbols) + (4 << 2)
.endif
        sub_n   v16, v17, v0, v1, v2, v3, \sz, \n  // cdf + (i >= val ? 1 : 0)
        orr     v2\sz, #0x80, lsl #8
.if \n == 16
        orr     v3\sz, #0x80, lsl #8
.endif
.if \n == 16
        sub     w4, w4, w3, lsr #4                 // -((count >> 4) + 5)
.elseif \n == 8
        lsr     w14, w3, #4                        // count >> 4
        sbc     w4, w4, w14                        // -((count >> 4) + (n_symbols > 2) + 4)
.else
        neg     w4, w14, lsr #2                    // -((n_symbols > 2) + 4)
        sub     w4, w4, w3, lsr #4                 // -((count >> 4) + (n_symbols > 2) + 4)
.endif
        sub_n   v2, v3, v2, v3, v0, v1, \sz, \n    // (32768 - cdf[i]) or (-1 - cdf[i])
        dup     v6\sz, w4                          // -rate

        sub     w3, w3, w3, lsr #5                 // count - (count == 32)
        sshl_n  v2, v3, v2, v3, v6, v6, \sz, \n    // ({32768,-1} - cdf[i]) >> rate
        add     w3, w3, #1                         // count + (count < 32)
        add_n   v0, v1, v16, v17, v2, v3, \sz, \n  // cdf + (32768 - cdf[i]) >> rate
        st1_n   v0, v1, x1, \sz, \n
        strh    w3, [x1, x2, lsl #1]

0:
        // renorm
.if \n == 4
        ldr     w6, [x0, #CNT]
        ldr     x7, [x0, #DIF]
        mov     x4, v29.d[0]                       // rng (packed)
        mov     x3, v4.d[0]                        // v (packed)

        // Shift 'v'/'rng' for ret into the 16 least significant bits.
        // There is garbage in the remaining bits, but we can work
        // around this.
        lsr     x4, x4, x15                        // rng
        lsr     x3, x3, x15                        // v
        lsl     w5, w4, #16                        // rng << 16
        sub     x7, x7, x3, lsl #48                // dif - (v << 48)
        clz     w5, w5                             // d = clz(rng << 16)
        lsl     w4, w4, w5                         // rng << d
        subs    w6, w6, w5                         // cnt -= d
        lsl     x7, x7, x5                         // (dif - (v << 48)) << d
        strh    w4, [x0, #RNG]
        b.lo    1f
        str     w6, [x0, #CNT]
        str     x7, [x0, #DIF]
        lsr     w0, w15, #4
        ret
1:
        lsr     w15, w15, #4
        b       L(refill)
.elseif \n == 8
        ldr     w6, [x0, #CNT]
        tbl     v30.8b, {v30.16b}, v31.8b
        tbl     v29.8b, {v29.16b}, v31.8b
        ins     v28.h[3], v30.h[0]                 // dif - (v << 48)
        clz     v0.4h, v29.4h                      // d = clz(rng)
        umov    w5, v0.h[0]
        ushl    v29.4h, v29.4h, v0.4h              // rng << d

        // The vector for clz(rng) is filled with garbage after the first
        // short, but ushl/sshl conveniently uses only the bottom byte for
        // the shift amount.
        ushl    d28, d28, d0                       // (dif - (v << 48)) << d

        subs    w6, w6, w5                         // cnt -= d
        str     h29, [x0, #RNG]
        b.lo    1f
        str     w6, [x0, #CNT]
        str     d28, [x0, #DIF]
        lsr     w0, w15, #1                        // ret
        ret
1:
        lsr     w15, w15, #1                       // ret
        mov     x7, v28.d[0]
        b       L(refill)
.elseif \n == 16
        add     x8, sp, w15, sxtw #1
        ldrh    w3, [x8, #48]                      // v
        ldurh   w4, [x8, #46]                      // u
        ldr     w6, [x0, #CNT]
        ldr     x7, [x0, #DIF]
        sub     w4, w4, w3                         // rng = u - v
        clz     w5, w4                             // clz(rng)
        eor     w5, w5, #16                        // d = clz(rng) ^ 16
        sub     x7, x7, x3, lsl #48                // dif - (v << 48)
        lsl     w4, w4, w5                         // rng << d
        subs    w6, w6, w5                         // cnt -= d
        lsl     x7, x7, x5                         // (dif - (v << 48)) << d
        str     w4, [x0, #RNG]
        add     sp, sp, #48
        b.lo    1f
        str     w6, [x0, #CNT]
        str     x7, [x0, #DIF]
        add     w0, w15, #\n                       // ret
        ret
1:
        add     w15, w15, #\n                      // ret
        b       L(refill)
.endif
.endm

        decode_update .4h, .8b, 4

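// Common refill path, shared (via L(refill)) by all the entry points in
// this file. On entry: x0 = MsacContext, w6 = cnt (negative, which is why
// the refill was taken), x7 = dif with the consumed bits shifted out, and
// w15 = the value to return. Up to 8 bytes are read from
// [BUF_POS, BUF_END), byteswapped and inverted before being OR'd into dif
// (the window holds inverted bits, which is why running past the end of
// the buffer pads with ones, see pad_with_ones below). CNT/DIF are stored
// back and w15 is returned in w0.
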
L(refill):
        // refill
        ldp     x3, x4, [x0]                       // BUF_POS, BUF_END
        add     x5, x3, #8
        subs    x5, x5, x4
        b.hi    6f

        ldr     x8, [x3]                           // next_bits
        add     w4, w6, #-48                       // shift_bits = cnt + 16 (- 64)
        mvn     x8, x8
        neg     w5, w4
        rev     x8, x8                             // next_bits = bswap(next_bits)
        lsr     w5, w5, #3                         // num_bytes_read
        lsr     x8, x8, x4                         // next_bits >>= (shift_bits & 63)

2:      // refill_end
        add     x3, x3, x5
        add     w6, w6, w5, lsl #3                 // cnt += num_bits_read
        str     x3, [x0, #BUF_POS]

3:      // refill_end2
        orr     x7, x7, x8                         // dif |= next_bits

4:      // end
        str     w6, [x0, #CNT]
        str     x7, [x0, #DIF]

        mov     w0, w15
        ret

5:      // pad_with_ones
        add     w8, w6, #-16
        ror     x8, x8, x8
        b       3b

6:      // refill_eob
        cmp     x3, x4
        b.hs    5b

        ldr     x8, [x4, #-8]
        lsl     w5, w5, #3
        lsr     x8, x8, x5
        add     w5, w6, #-48
        mvn     x8, x8
        sub     w4, w4, w3                         // num_bytes_left
        rev     x8, x8
        lsr     x8, x8, x5
        neg     w5, w5
        lsr     w5, w5, #3
        cmp     w5, w4
        csel    w5, w5, w4, lo                     // num_bytes_read
        b       2b
endfunc

function msac_decode_symbol_adapt8_neon, export=1
        decode_update .8h, .16b, 8
endfunc

function msac_decode_symbol_adapt16_neon, export=1
        decode_update .8h, .16b, 16
endfunc

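// unsigned dav1d_msac_decode_hi_tok_neon(MsacContext *s, uint16_t *cdf);
//
// hi_tok decodes the same 4-entry cdf up to four times, where symbol 3
// means "keep going". Modelled on dav1d's C reference (a sketch; see
// src/msac.c for the exact version):
//
//     unsigned tok_br = decode_symbol_adapt4(s, cdf, 3);
//     unsigned tok = 3 + tok_br;
//     if (tok_br == 3) {
//         tok_br = decode_symbol_adapt4(s, cdf, 3);
//         tok = 6 + tok_br;
//         ...                    // up to two more rounds, capped at tok = 15
//     }
//     return tok;
//
// The loop below accumulates 16*tok in w13 (hence the 5*8/24*8/30*8
// constants, operating on w15 = 16*ret) and uses the carry flag from the
// adds to detect "tok_br < 3 || tok == 15", i.e. when to stop iterating.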
function msac_decode_hi_tok_neon, export=1
        ld1     {v0.4h}, [x1]                      // cdf
        add     x16, x0, #RNG
        movi    v31.4h, #0x7f, lsl #8              // 0x7f00
        movrel  x17, coeffs, COEFFS_BASE_OFFSET-2*3
        mvni    v30.4h, #0x3f                      // 0xffc0
        ldrh    w9, [x1, #6]                       // count = cdf[n_symbols]
        ld1r    {v3.4h}, [x16]                     // rng
        ld1     {v29.4h}, [x17]                    // EC_MIN_PROB * (n_symbols - ret)
        add     x17, x0, #DIF + 6
        mov     w13, #-24*8
        and     v17.8b, v0.8b, v30.8b              // cdf & 0xffc0
        ldr     w10, [x0, #ALLOW_UPDATE_CDF]
        ld1r    {v1.8h}, [x17]                     // dif >> (EC_WIN_SIZE - 16)
        ldr     w6, [x0, #CNT]
        ldr     x7, [x0, #DIF]
1:
        and     v7.8b, v3.8b, v31.8b               // rng & 0x7f00
        sqdmulh v6.4h, v17.4h, v7.4h               // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1
        add     v4.4h, v17.4h, v29.4h              // v = cdf + EC_MIN_PROB * (n_symbols - ret)
        add     v4.4h, v6.4h, v4.4h                // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret)
        cmhs    v2.4h, v1.4h, v4.4h                // c >= v
        add     w13, w13, #5*8
        ext     v18.8b, v3.8b, v4.8b, #6           // u
        umov    x15, v2.d[0]
        rev     x15, x15
        sub     v18.4h, v18.4h, v4.4h              // rng = u-v
        // rev + clz = count trailing zeros
        clz     x15, x15                           // 16*ret

        cbz     w10, 2f
        // update_cdf
        sub     v5.4h, v0.4h, v2.4h                // cdf[i] + (i >= val ? 1 : 0)
        mov     w4, #-5
        orr     v2.4h, #0x80, lsl #8               // i >= val ? -1 : 32768
        sub     w4, w4, w9, lsr #4                 // -((count >> 4) + 5)
        sub     v2.4h, v2.4h, v0.4h                // (32768 - cdf[i]) or (-1 - cdf[i])
        dup     v6.4h, w4                          // -rate

        sub     w9, w9, w9, lsr #5                 // count - (count == 32)
        sshl    v2.4h, v2.4h, v6.4h                // ({32768,-1} - cdf[i]) >> rate
        add     w9, w9, #1                         // count + (count < 32)
        add     v0.4h, v5.4h, v2.4h                // cdf[i] + (32768 - cdf[i]) >> rate
        st1     {v0.4h}, [x1]
        and     v17.8b, v0.8b, v30.8b              // cdf & 0xffc0
        strh    w9, [x1, #6]

2:
        mov     x4, v18.d[0]                       // rng (packed)
        mov     x3, v4.d[0]                        // v (packed)

        // Shift 'v'/'rng' for ret into the 16 least significant bits.
        // There is garbage in the remaining bits, but we can work
        // around this.
        lsr     x4, x4, x15                        // rng
        lsr     x3, x3, x15                        // v
        lsl     w5, w4, #16                        // rng << 16
        sub     x7, x7, x3, lsl #48                // dif - (v << 48)
        clz     w5, w5                             // d = clz(rng << 16)
        lsl     w4, w4, w5                         // rng << d
        subs    w6, w6, w5                         // cnt -= d
        lsl     x7, x7, x5                         // (dif - (v << 48)) << d
        strh    w4, [x0, #RNG]
        dup     v3.4h, w4
        b.hs    5f

        // refill
        ldp     x3, x4, [x0]                       // BUF_POS, BUF_END
        add     x5, x3, #8
        subs    x5, x5, x4
        b.hi    7f

        ldr     x8, [x3]                           // next_bits
        add     w4, w6, #-48                       // shift_bits = cnt + 16 (- 64)
        mvn     x8, x8
        neg     w5, w4
        rev     x8, x8                             // next_bits = bswap(next_bits)
        lsr     w5, w5, #3                         // num_bytes_read
        lsr     x8, x8, x4                         // next_bits >>= (shift_bits & 63)

3:      // refill_end
        add     x3, x3, x5
        add     w6, w6, w5, lsl #3                 // cnt += num_bits_read
        str     x3, [x0, #BUF_POS]

4:      // refill_end2
        orr     x7, x7, x8                         // dif |= next_bits

5:      // end
        sub     w15, w15, #5*8
        lsr     x12, x7, #48
        adds    w13, w13, w15                      // carry = tok_br < 3 || tok == 15
        dup     v1.8h, w12
        b.cc    1b                                 // loop if !carry
        add     w13, w13, #30*8
        str     w6, [x0, #CNT]
        str     x7, [x0, #DIF]
        lsr     w0, w13, #4
        ret

6:      // pad_with_ones
        add     w8, w6, #-16
        ror     x8, x8, x8
        b       4b

7:      // refill_eob
        cmp     x3, x4
        b.hs    6b

        ldr     x8, [x4, #-8]
        lsl     w5, w5, #3
        lsr     x8, x8, x5
        add     w5, w6, #-48
        mvn     x8, x8
        sub     w4, w4, w3                         // num_bytes_left
        rev     x8, x8
        lsr     x8, x8, x5
        neg     w5, w5
        lsr     w5, w5, #3
        cmp     w5, w4
        csel    w5, w5, w4, lo                     // num_bytes_read
        b       3b
endfunc

function msac_decode_bool_equi_neon, export=1
        ldp     w5, w6, [x0, #RNG]                 // + CNT
        ldr     x7, [x0, #DIF]
        bic     w4, w5, #0xff                      // r &= 0xff00
        add     w4, w4, #8
        subs    x8, x7, x4, lsl #47                // dif - vw
        lsr     w4, w4, #1                         // v
        sub     w5, w5, w4                         // r - v
        cset    w15, lo
        csel    w4, w5, w4, hs                     // if (ret) v = r - v;
        csel    x7, x8, x7, hs                     // if (ret) dif = dif - vw;

        clz     w5, w4                             // clz(rng)
        eor     w5, w5, #16                        // d = clz(rng) ^ 16
        lsl     w4, w4, w5                         // rng << d
        subs    w6, w6, w5                         // cnt -= d
        lsl     x7, x7, x5                         // (dif - (v << 48)) << d
        str     w4, [x0, #RNG]
        b.lo    L(refill)

        str     w6, [x0, #CNT]
        str     x7, [x0, #DIF]
        mov     w0, w15
        ret
endfunc

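// For reference, the scalar form of the bool decode below, modelled on
// dav1d's dav1d_msac_decode_bool_c() (a sketch, not the exact source;
// f is a 15-bit probability):
//
//     unsigned v = ((s->rng >> 8) * (f >> 6) >> 1) + 4;
//     ec_win vw = (ec_win)v << 48;
//     unsigned bit = s->dif < vw;
//     if (!bit) s->dif -= vw;
//     s->rng = bit ? v : s->rng - v;
//     // renormalize: d = clz16(rng); rng <<= d; dif <<= d; cnt -= d,
//     // refilling when cnt goes negative
//     return bit;
//
// msac_decode_bool_adapt additionally updates its two-entry cdf with
// rate = 4 + (count >> 4), mirroring update_cdf for a binary symbol.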
function msac_decode_bool_neon, export=1
        ldp     w5, w6, [x0, #RNG]                 // + CNT
        ldr     x7, [x0, #DIF]
        lsr     w4, w5, #8                         // r >> 8
        bic     w1, w1, #0x3f                      // f &= ~63
        mul     w4, w4, w1
        lsr     w4, w4, #7
        add     w4, w4, #4                         // v
        subs    x8, x7, x4, lsl #48                // dif - vw
        sub     w5, w5, w4                         // r - v
        cset    w15, lo
        csel    w4, w5, w4, hs                     // if (ret) v = r - v;
        csel    x7, x8, x7, hs                     // if (ret) dif = dif - vw;

        clz     w5, w4                             // clz(rng)
        eor     w5, w5, #16                        // d = clz(rng) ^ 16
        lsl     w4, w4, w5                         // rng << d
        subs    w6, w6, w5                         // cnt -= d
        lsl     x7, x7, x5                         // (dif - (v << 48)) << d
        str     w4, [x0, #RNG]
        b.lo    L(refill)

        str     w6, [x0, #CNT]
        str     x7, [x0, #DIF]
        mov     w0, w15
        ret
endfunc

function msac_decode_bool_adapt_neon, export=1
        ldr     w9, [x1]                           // cdf[0-1]
        ldp     w5, w6, [x0, #RNG]                 // + CNT
        ldr     x7, [x0, #DIF]
        lsr     w4, w5, #8                         // r >> 8
        and     w2, w9, #0xffc0                    // f &= ~63
        mul     w4, w4, w2
        lsr     w4, w4, #7
        add     w4, w4, #4                         // v
        subs    x8, x7, x4, lsl #48                // dif - vw
        sub     w5, w5, w4                         // r - v
        cset    w15, lo
        csel    w4, w5, w4, hs                     // if (ret) v = r - v;
        csel    x7, x8, x7, hs                     // if (ret) dif = dif - vw;

        ldr     w10, [x0, #ALLOW_UPDATE_CDF]

        clz     w5, w4                             // clz(rng)
        eor     w5, w5, #16                        // d = clz(rng) ^ 16

        cbz     w10, 1f

        lsr     w2, w9, #16                        // count = cdf[1]
        and     w9, w9, #0xffff                    // cdf[0]

        sub     w3, w2, w2, lsr #5                 // count - (count == 32)
        lsr     w2, w2, #4                         // count >> 4
        add     w10, w3, #1                        // count + (count < 32)
        add     w2, w2, #4                         // rate = (count >> 4) + 4

        sub     w9, w9, w15                        // cdf[0] -= bit
        sub     w11, w9, w15, lsl #15              // {cdf[0], cdf[0] - 32769}
        asr     w11, w11, w2                       // {cdf[0], cdf[0] - 32769} >> rate
        sub     w9, w9, w11                        // cdf[0]

        strh    w9, [x1]
        strh    w10, [x1, #2]

1:
        lsl     w4, w4, w5                         // rng << d
        subs    w6, w6, w5                         // cnt -= d
        lsl     x7, x7, x5                         // (dif - (v << 48)) << d
        str     w4, [x0, #RNG]
        b.lo    L(refill)

        str     w6, [x0, #CNT]
        str     x7, [x0, #DIF]
        mov     w0, w15
        ret
endfunc