/*
 * Copyright © 2019, VideoLAN and dav1d authors
 * Copyright © 2020, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"
#include "util.S"

#define BUF_POS 0
#define BUF_END 4
#define DIF 8
#define RNG 12
#define CNT 16
#define ALLOW_UPDATE_CDF 20

const coeffs
        .short 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0
        .short 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
endconst

const bits, align=4
        .short 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
        .short 0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000
endconst
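
// The offsets above index into MsacContext. A minimal C sketch of the layout
// they assume (modeled on dav1d's struct in src/msac.h, with the 32-bit
// ec_win used on this architecture); field names are for illustration only:
//
//    typedef struct MsacContext {
//        const uint8_t *buf_pos;   // BUF_POS (0)
//        const uint8_t *buf_end;   // BUF_END (4)
//        uint32_t dif;             // DIF (8), ec_win is 32 bits here
//        unsigned rng;             // RNG (12)
//        int cnt;                  // CNT (16)
//        int allow_update_cdf;     // ALLOW_UPDATE_CDF (20)
//    } MsacContext;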

.macro vld1_align_n d0, q0, q1, src, n
.if \n == 4
        vld1.16         {\d0},  [\src, :64]
.elseif \n == 8
        vld1.16         {\q0},  [\src, :128]
.else
        vld1.16         {\q0, \q1},  [\src, :128]
.endif
.endm

.macro vld1_n d0, q0, q1, src, n
.if \n == 4
        vld1.16         {\d0},  [\src]
.elseif \n == 8
        vld1.16         {\q0},  [\src]
.else
        vld1.16         {\q0, \q1},  [\src]
.endif
.endm

.macro vst1_align_n d0, q0, q1, src, n
.if \n == 4
        vst1.16         {\d0},  [\src, :64]
.elseif \n == 8
        vst1.16         {\q0},  [\src, :128]
.else
        vst1.16         {\q0, \q1},  [\src, :128]
.endif
.endm

.macro vst1_n d0, q0, q1, src, n
.if \n == 4
        vst1.16         {\d0},  [\src]
.elseif \n == 8
        vst1.16         {\q0},  [\src]
.else
        vst1.16         {\q0, \q1},  [\src]
.endif
.endm

.macro vshr_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n
.if \n == 4
        vshr.u16        \d0,  \s0,  \s3
.else
        vshr.u16        \d1,  \s1,  \s4
.if \n == 16
        vshr.u16        \d2,  \s2,  \s5
.endif
.endif
.endm

.macro vadd_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n
.if \n == 4
        vadd.i16        \d0,  \s0,  \s3
.else
        vadd.i16        \d1,  \s1,  \s4
.if \n == 16
        vadd.i16        \d2,  \s2,  \s5
.endif
.endif
.endm

.macro vsub_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n
.if \n == 4
        vsub.i16        \d0,  \s0,  \s3
.else
        vsub.i16        \d1,  \s1,  \s4
.if \n == 16
        vsub.i16        \d2,  \s2,  \s5
.endif
.endif
.endm

.macro vand_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n
.if \n == 4
        vand            \d0,  \s0,  \s3
.else
        vand            \d1,  \s1,  \s4
.if \n == 16
        vand            \d2,  \s2,  \s5
.endif
.endif
.endm

.macro vcge_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n
.if \n == 4
        vcge.u16        \d0,  \s0,  \s3
.else
        vcge.u16        \d1,  \s1,  \s4
.if \n == 16
        vcge.u16        \d2,  \s2,  \s5
.endif
.endif
.endm

.macro vrhadd_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n
.if \n == 4
        vrhadd.u16      \d0,  \s0,  \s3
.else
        vrhadd.u16      \d1,  \s1,  \s4
.if \n == 16
        vrhadd.u16      \d2,  \s2,  \s5
.endif
.endif
.endm

.macro vshl_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n
.if \n == 4
        vshl.s16        \d0,  \s0,  \s3
.else
        vshl.s16        \d1,  \s1,  \s4
.if \n == 16
        vshl.s16        \d2,  \s2,  \s5
.endif
.endif
.endm

.macro vqdmulh_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n
.if \n == 4
        vqdmulh.s16     \d0,  \s0,  \s3
.else
        vqdmulh.s16     \d1,  \s1,  \s4
.if \n == 16
        vqdmulh.s16     \d2,  \s2,  \s5
.endif
.endif
.endm
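
// What the decode_update/renorm code below computes, as a rough C sketch
// following dav1d's reference implementation (dav1d_msac_decode_symbol_adapt_c
// in src/msac.c, with EC_PROB_SHIFT = 6 and EC_MIN_PROB = 4); the NEON code
// evaluates all candidate v values in parallel instead of looping:
//
//    unsigned c = s->dif >> (EC_WIN_SIZE - 16), r = s->rng >> 8;
//    unsigned u, v = s->rng, val = -1;
//    do {
//        val++;
//        u = v;
//        v = r * (cdf[val] >> EC_PROB_SHIFT) >> (7 - EC_PROB_SHIFT);
//        v += EC_MIN_PROB * (n_symbols - val);
//    } while (c < v);
//    // renormalize with rng = u - v and dif -= v << (EC_WIN_SIZE - 16),
//    // refilling dif from buf_pos when cnt goes negative; then, if
//    // s->allow_update_cdf is set, adapt the CDF towards the decoded symbol.
//    return val;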

// unsigned dav1d_msac_decode_symbol_adapt4_neon(MsacContext *s, uint16_t *cdf,
//                                               size_t n_symbols);

function msac_decode_symbol_adapt4_neon, export=1
.macro decode_update n
        push            {r4-r10,lr}
        sub             sp,  sp,  #48
        add             r8,  r0,  #RNG

        vld1_align_n    d0,   q0,   q1,   r1,  \n // cdf
        vld1.16         {d16[]}, [r8, :16]        // rng
        movrel_local    r9,  coeffs, 30
        vmov.i16        d30, #0x7f00              // 0x7f00
        sub             r9,  r9,  r2,  lsl #1
        vmvn.i16        q14, #0x3f                // 0xffc0
        add             r8,  sp,  #14
        vand            d22, d16, d30             // rng & 0x7f00
        vst1.16         {d16[0]}, [r8, :16]       // store original u = s->rng
        vand_n          d4,   q2,   q3,   d0,   q0,   q1,   d28,  q14,  q14, \n // cdf & 0xffc0
.if \n > 4
        vmov            d23, d22
.endif

        vld1_n          d16,  q8,   q9,   r9,  \n // EC_MIN_PROB * (n_symbols - ret)
        vqdmulh_n       d20,  q10,  q11,  d4,   q2,   q3,   d22,  q11,  q11, \n // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1
        add             r8,  r0,  #DIF + 2

        vadd_n          d16,  q8,   q9,   d4,   q2,   q3,   d16,  q8,   q9,  \n // v = cdf + EC_MIN_PROB * (n_symbols - ret)
.if \n == 4
        vmov.i16        d17, #0
.endif
        vadd_n          d16,  q8,   q9,   d20,  q10,  q11,  d16,  q8,   q9,  \n // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret)

        add             r9,  sp,  #16
        vld1.16         {d20[]}, [r8, :16]        // dif >> (EC_WIN_SIZE - 16)
        movrel_local    r8,  bits
        vst1_n          q8,   q8,   q9,   r9,  \n // store v values to allow indexed access

        vmov            d21, d20
        vld1_align_n    q12,  q12,  q13,  r8,  \n
.if \n == 16
        vmov            q11, q10
.endif

        vcge_n          q2,   q2,   q3,   q10,  q10,  q11,  q8,   q8,   q9,  \n // c >= v

        vand_n          q10,  q10,  q11,  q2,   q2,   q3,   q12,  q12,  q13, \n // One bit per halfword set in the mask
.if \n == 16
        vadd.i16        q10,  q10,  q11
.endif
        vadd.i16        d20,  d20,  d21           // Aggregate mask bits
        ldr             r4,  [r0, #ALLOW_UPDATE_CDF]
        vpadd.i16       d20,  d20,  d20
        lsl             r10, r2,  #1
        vpadd.i16       d20,  d20,  d20
        vmov.u16        r3,  d20[0]
        cmp             r4,  #0
        rbit            r3,  r3
        clz             lr,  r3                   // ret

        beq             L(renorm)
        // update_cdf
        ldrh            r3,  [r1, r10]            // count = cdf[n_symbols]
        vmov.i8         q10, #0xff
.if \n == 16
        mov             r4,  #-5
.else
        mvn             r12, r2
        mov             r4,  #-4
        cmn             r12, #3                   // set C if n_symbols <= 2
.endif
        vrhadd_n        d16,  q8,   q9,   d20,  q10,  q10,  d4,   q2,   q3,  \n // i >= val ? -1 : 32768
.if \n == 16
        sub             r4,  r4,  r3,  lsr #4     // -((count >> 4) + 5)
.else
        lsr             r12, r3,  #4              // count >> 4
        sbc             r4,  r4,  r12             // -((count >> 4) + (n_symbols > 2) + 4)
.endif
        vsub_n          d16,  q8,   q9,   d16,  q8,   q9,   d0,   q0,   q1,  \n // (32768 - cdf[i]) or (-1 - cdf[i])
.if \n == 4
        vdup.16         d20, r4                   // -rate
.else
        vdup.16         q10, r4                   // -rate
.endif

        sub             r3,  r3,  r3,  lsr #5     // count - (count == 32)
        vsub_n          d0,   q0,   q1,   d0,   q0,   q1,   d4,   q2,   q3,  \n // cdf + (i >= val ? 1 : 0)
        vshl_n          d16,  q8,   q9,   d16,  q8,   q9,   d20,  q10,  q10, \n // ({32768,-1} - cdf[i]) >> rate
        add             r3,  r3,  #1              // count + (count < 32)
        vadd_n          d0,   q0,   q1,   d0,   q0,   q1,   d16,  q8,   q9,  \n // cdf + (32768 - cdf[i]) >> rate
        vst1_align_n    d0,   q0,   q1,   r1,  \n
        strh            r3,  [r1, r10]
.endm

        decode_update   4

L(renorm):
        add             r8,  sp,  #16
        add             r8,  r8,  lr,  lsl #1
        ldrh            r3,  [r8]                 // v
        ldrh            r4,  [r8, #-2]            // u
        ldr             r6,  [r0, #CNT]
        ldr             r7,  [r0, #DIF]
        sub             r4,  r4,  r3              // rng = u - v
        clz             r5,  r4                   // clz(rng)
        eor             r5,  r5,  #16             // d = clz(rng) ^ 16
        sub             r7,  r7,  r3,  lsl #16    // dif - (v << 16)
L(renorm2):
        lsl             r4,  r4,  r5              // rng << d
        subs            r6,  r6,  r5              // cnt -= d
        lsl             r7,  r7,  r5              // (dif - (v << 16)) << d
        str             r4,  [r0, #RNG]
        bhs             4f

        // refill
        ldr             r3,  [r0, #BUF_POS]       // BUF_POS
        ldr             r4,  [r0, #BUF_END]       // BUF_END
        add             r5,  r3,  #4
        subs            r5,  r5,  r4
        bhi             6f

        ldr             r8,  [r3]                 // next_bits
        rsb             r5,  r6,  #16
        add             r4,  r6,  #16             // shift_bits = cnt + 16
        mvn             r8,  r8
        lsr             r5,  r5,  #3              // num_bytes_read
        rev             r8,  r8                   // next_bits = bswap(next_bits)
        lsr             r8,  r8,  r4              // next_bits >>= shift_bits

2:      // refill_end
        add             r3,  r3,  r5
        add             r6,  r6,  r5,  lsl #3     // cnt += num_bits_read
        str             r3,  [r0, #BUF_POS]

3:      // refill_end2
        orr             r7,  r7,  r8              // dif |= next_bits

4:      // end
        str             r6,  [r0, #CNT]
        str             r7,  [r0, #DIF]
        mov             r0,  lr
        add             sp,  sp,  #48
        pop             {r4-r10,pc}

5:      // pad_with_ones
        add             r8,  r6,  #-240
        lsr             r8,  r8,  r8
        b               3b

6:      // refill_eob
        cmp             r3,  r4
        bhs             5b

        ldr             r8,  [r4, #-4]
        lsl             r5,  r5,  #3
        lsr             r8,  r8,  r5
        add             r5,  r6,  #16
        mvn             r8,  r8
        sub             r4,  r4,  r3              // num_bytes_left
        rev             r8,  r8
        lsr             r8,  r8,  r5
        rsb             r5,  r6,  #16
        lsr             r5,  r5,  #3
        cmp             r5,  r4
        it              hs
        movhs           r5,  r4
        b               2b
endfunc

function msac_decode_symbol_adapt8_neon, export=1
        decode_update   8
        b               L(renorm)
endfunc

function msac_decode_symbol_adapt16_neon, export=1
        decode_update   16
        b               L(renorm)
endfunc
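
// unsigned dav1d_msac_decode_hi_tok_neon(MsacContext *s, uint16_t *cdf);
//
// A rough C sketch of what the function below implements, following dav1d's
// reference (dav1d_msac_decode_hi_tok_c in src/msac.c): up to four 3-symbol
// adaptive decodes, stopping as soon as one of them returns less than 3:
//
//    unsigned tok_br = dav1d_msac_decode_symbol_adapt4(s, cdf, 3);
//    unsigned tok = 3 + tok_br;
//    if (tok_br == 3) {
//        tok_br = dav1d_msac_decode_symbol_adapt4(s, cdf, 3);
//        tok = 6 + tok_br;
//        if (tok_br == 3) {
//            tok_br = dav1d_msac_decode_symbol_adapt4(s, cdf, 3);
//            tok = 9 + tok_br;
//            if (tok_br == 3)
//                tok += dav1d_msac_decode_symbol_adapt4(s, cdf, 3);
//        }
//    }
//    return tok;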

function msac_decode_hi_tok_neon, export=1
        push            {r4-r10,lr}
        vld1.16         {d0}, [r1, :64]           // cdf
        add             r4,  r0,  #RNG
        vmov.i16        d31, #0x7f00              // 0x7f00
        movrel_local    r5,  coeffs, 30-2*3
        vmvn.i16        d30, #0x3f                // 0xffc0
        ldrh            r9,  [r1, #6]             // count = cdf[n_symbols]
        vld1.16         {d1[]}, [r4, :16]         // rng
        movrel_local    r4,  bits
        vld1.16         {d29}, [r5]               // EC_MIN_PROB * (n_symbols - ret)
        add             r5,  r0,  #DIF + 2
        vld1.16         {q8},  [r4, :128]
        mov             r2,  #-24
        vand            d20, d0,  d30             // cdf & 0xffc0
        ldr             r10, [r0, #ALLOW_UPDATE_CDF]
        vld1.16         {d2[]}, [r5, :16]         // dif >> (EC_WIN_SIZE - 16)
        sub             sp,  sp,  #48
        ldr             r6,  [r0, #CNT]
        ldr             r7,  [r0, #DIF]
        vmov            d3,  d2
1:
        vand            d23, d1,  d31             // rng & 0x7f00
        vqdmulh.s16     d18, d20, d23             // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1
        add             r12, sp,  #14
        vadd.i16        d6,  d20, d29             // v = cdf + EC_MIN_PROB * (n_symbols - ret)
        vadd.i16        d6,  d18, d6              // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret)
        vmov.i16        d7,  #0
        vst1.16         {d1[0]}, [r12, :16]       // store original u = s->rng
        add             r12, sp,  #16
        vcge.u16        q2,  q1,  q3              // c >= v
        vst1.16         {q3}, [r12]               // store v values to allow indexed access
        vand            q9,  q2,  q8              // One bit per halfword set in the mask

        vadd.i16        d18, d18, d19             // Aggregate mask bits
        vpadd.i16       d18, d18, d18
        vpadd.i16       d18, d18, d18
        vmov.u16        r3,  d18[0]
        cmp             r10, #0
        add             r2,  r2,  #5
        rbit            r3,  r3
        add             r8,  sp,  #16
        clz             lr,  r3                   // ret

        beq             2f
        // update_cdf
        vmov.i8         d22, #0xff
        mov             r4,  #-5
        vrhadd.u16      d6,  d22, d4              // i >= val ? -1 : 32768
        sub             r4,  r4,  r9,  lsr #4     // -((count >> 4) + 5)
        vsub.i16        d6,  d6,  d0              // (32768 - cdf[i]) or (-1 - cdf[i])
        vdup.16         d18, r4                   // -rate

        sub             r9,  r9,  r9,  lsr #5     // count - (count == 32)
        vsub.i16        d0,  d0,  d4              // cdf + (i >= val ? 1 : 0)
        vshl.s16        d6,  d6,  d18             // ({32768,-1} - cdf[i]) >> rate
        add             r9,  r9,  #1              // count + (count < 32)
        vadd.i16        d0,  d0,  d6              // cdf + (32768 - cdf[i]) >> rate
        vst1.16         {d0}, [r1, :64]
        vand            d20, d0,  d30             // cdf & 0xffc0
        strh            r9,  [r1, #6]

2:
        add             r8,  r8,  lr,  lsl #1
        ldrh            r3,  [r8]                 // v
        ldrh            r4,  [r8, #-2]            // u
        sub             r4,  r4,  r3              // rng = u - v
        clz             r5,  r4                   // clz(rng)
        eor             r5,  r5,  #16             // d = clz(rng) ^ 16
        sub             r7,  r7,  r3,  lsl #16    // dif - (v << 16)
        lsl             r4,  r4,  r5              // rng << d
        subs            r6,  r6,  r5              // cnt -= d
        lsl             r7,  r7,  r5              // (dif - (v << 16)) << d
        str             r4,  [r0, #RNG]
        vdup.16         d1,  r4
        bhs             5f

        // refill
        ldr             r3,  [r0, #BUF_POS]       // BUF_POS
        ldr             r4,  [r0, #BUF_END]       // BUF_END
        add             r5,  r3,  #4
        subs            r5,  r5,  r4
        bhi             7f

        ldr             r8,  [r3]                 // next_bits
        rsb             r5,  r6,  #16
        add             r4,  r6,  #16             // shift_bits = cnt + 16
        mvn             r8,  r8
        lsr             r5,  r5,  #3              // num_bytes_read
        rev             r8,  r8                   // next_bits = bswap(next_bits)
        lsr             r8,  r8,  r4              // next_bits >>= shift_bits

3:      // refill_end
        add             r3,  r3,  r5
        add             r6,  r6,  r5,  lsl #3     // cnt += num_bits_read
        str             r3,  [r0, #BUF_POS]

4:      // refill_end2
        orr             r7,  r7,  r8              // dif |= next_bits

5:      // end
        lsl             lr,  lr,  #1
        sub             lr,  lr,  #5
        lsr             r12, r7,  #16
        adds            r2,  r2,  lr              // carry = tok_br < 3 || tok == 15
        vdup.16         q1,  r12
        bcc             1b                        // loop if !carry
        add             r2,  r2,  #30
        str             r6,  [r0, #CNT]
        add             sp,  sp,  #48
        str             r7,  [r0, #DIF]
        lsr             r0,  r2,  #1
        pop             {r4-r10,pc}

6:      // pad_with_ones
        add             r8,  r6,  #-240
        lsr             r8,  r8,  r8
        b               4b

7:      // refill_eob
        cmp             r3,  r4
        bhs             6b

        ldr             r8,  [r4, #-4]
        lsl             r5,  r5,  #3
        lsr             r8,  r8,  r5
        add             r5,  r6,  #16
        mvn             r8,  r8
        sub             r4,  r4,  r3              // num_bytes_left
        rev             r8,  r8
        lsr             r8,  r8,  r5
        rsb             r5,  r6,  #16
        lsr             r5,  r5,  #3
        cmp             r5,  r4
        it              hs
        movhs           r5,  r4
        b               3b
endfunc

function msac_decode_bool_equi_neon, export=1
        push            {r4-r10,lr}
        ldr             r5,  [r0, #RNG]
        ldr             r6,  [r0, #CNT]
        sub             sp,  sp,  #48
        ldr             r7,  [r0, #DIF]
        bic             r4,  r5,  #0xff           // r &= 0xff00
        add             r4,  r4,  #8
        mov             r2,  #0
        subs            r8,  r7,  r4,  lsl #15    // dif - vw
        lsr             r4,  r4,  #1              // v
        sub             r5,  r5,  r4              // r - v
        itee            lo
        movlo           r2,  #1
        movhs           r4,  r5                   // if (ret) v = r - v;
        movhs           r7,  r8                   // if (ret) dif = dif - vw;

        clz             r5,  r4                   // clz(rng)
        eor             r5,  r5,  #16             // d = clz(rng) ^ 16
        mov             lr,  r2
        b               L(renorm2)
endfunc

function msac_decode_bool_neon, export=1
        push            {r4-r10,lr}
        ldr             r5,  [r0, #RNG]
        ldr             r6,  [r0, #CNT]
        sub             sp,  sp,  #48
        ldr             r7,  [r0, #DIF]
        lsr             r4,  r5,  #8              // r >> 8
        bic             r1,  r1,  #0x3f           // f &= ~63
        mul             r4,  r4,  r1
        mov             r2,  #0
        lsr             r4,  r4,  #7
        add             r4,  r4,  #4              // v
        subs            r8,  r7,  r4,  lsl #16    // dif - vw
        sub             r5,  r5,  r4              // r - v
        itee            lo
        movlo           r2,  #1
        movhs           r4,  r5                   // if (ret) v = r - v;
        movhs           r7,  r8                   // if (ret) dif = dif - vw;

        clz             r5,  r4                   // clz(rng)
        eor             r5,  r5,  #16             // d = clz(rng) ^ 16
        mov             lr,  r2
        b               L(renorm2)
endfunc
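
// What msac_decode_bool_adapt_neon below computes, as a hedged C sketch
// following dav1d's reference (dav1d_msac_decode_bool_adapt_c in src/msac.c);
// the asm folds the CDF update into branchless integer arithmetic. count is
// capped at 32, so rate grows from 4 to 6 as the context adapts:
//
//    unsigned bit = dav1d_msac_decode_bool(s, cdf[0]);
//    if (s->allow_update_cdf) {
//        unsigned count = cdf[1];
//        int rate = 4 + (count >> 4);
//        if (bit)
//            cdf[0] += (32768 - cdf[0]) >> rate;
//        else
//            cdf[0] -= cdf[0] >> rate;
//        cdf[1] = count + (count < 32);
//    }
//    return bit;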

function msac_decode_bool_adapt_neon, export=1
        push            {r4-r10,lr}
        ldr             r9,  [r1]                 // cdf[0-1]
        ldr             r5,  [r0, #RNG]
        movw            lr,  #0xffc0
        ldr             r6,  [r0, #CNT]
        sub             sp,  sp,  #48
        ldr             r7,  [r0, #DIF]
        lsr             r4,  r5,  #8              // r >> 8
        and             r2,  r9,  lr              // f &= ~63
        mul             r4,  r4,  r2
        mov             r2,  #0
        lsr             r4,  r4,  #7
        add             r4,  r4,  #4              // v
        subs            r8,  r7,  r4,  lsl #16    // dif - vw
        sub             r5,  r5,  r4              // r - v
        ldr             r10, [r0, #ALLOW_UPDATE_CDF]
        itee            lo
        movlo           r2,  #1
        movhs           r4,  r5                   // if (ret) v = r - v;
        movhs           r7,  r8                   // if (ret) dif = dif - vw;

        cmp             r10, #0
        clz             r5,  r4                   // clz(rng)
        eor             r5,  r5,  #16             // d = clz(rng) ^ 16
        mov             lr,  r2

        beq             L(renorm2)

        lsr             r2,  r9,  #16             // count = cdf[1]
        uxth            r9,  r9                   // cdf[0]

        sub             r3,  r2,  r2,  lsr #5     // count - (count >= 32)
        lsr             r2,  r2,  #4              // count >> 4
        add             r10, r3,  #1              // count + (count < 32)
        add             r2,  r2,  #4              // rate = (count >> 4) | 4

        sub             r9,  r9,  lr              // cdf[0] -= bit
        sub             r3,  r9,  lr,  lsl #15    // {cdf[0], cdf[0] - 32769}
        asr             r3,  r3,  r2              // {cdf[0], cdf[0] - 32769} >> rate
        sub             r9,  r9,  r3              // cdf[0]

        strh            r9,  [r1]
        strh            r10, [r1, #2]

        b               L(renorm2)
endfunc
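
// Note on the branchless CDF update above: with bit in {0,1} it computes,
// in C terms,
//
//    cdf0 -= bit;
//    cdf0 -= (cdf0 - (bit << 15)) >> rate;   // arithmetic shift
//
// For bit == 0 this reduces to cdf[0] -= cdf[0] >> rate; for bit == 1 the
// shifted operand is cdf[0] - 32769, which is negative, and the arithmetic
// shift's rounding towards -infinity makes the result equal to
// cdf[0] + ((32768 - cdf[0]) >> rate) — the same two branches as the C
// reference, without a conditional.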